[llvm] [LowerBufferFatPointers] Correctly handle alignment modes (PR #134329)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 3 17:23:13 PDT 2025
https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/134329
Previously, AMDGPULowerBufferFatPointers would emit unaligned buffer loads/stores, even when such unaligned accesses were disabled (that is, on non-HSA platforms).
In addition, the lowering did not respect the newly-added relaxed-buffer-oob-mode feature, which now must be enabled in order to vectorize unaligned loads from buffers.
This commit fixes both issues and adds tests.
From 4945b6a1c668d949fa550e8a837333c86c0a8b67 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 25 Mar 2025 19:25:16 +0000
Subject: [PATCH] [LowerBufferFatPointers] Correctly handle alignment modes
Previously, AMDGPULowerBufferFatPointers would emit unaligned buffer
loads/stores, even when such unaligned accesses were disabled (that
is, on non-HSA platforms).
In addition, the lowering did not respect the newly-added
relaxed-buffer-oob-mode feature, which now must be enabled in order to
vectorize unaligned loads from buffers.
This commit fixes both issues and adds tests.
---
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 134 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 1689 ++++++---
.../buffer-fat-pointer-atomicrmw-fmax.ll | 1208 +++++--
.../buffer-fat-pointer-atomicrmw-fmin.ll | 1208 +++++--
...ffer-fat-pointers-contents-legalization.ll | 1795 ++++++----
.../AMDGPU/buffer-fat-pointers-memcpy.ll | 370 +-
.../AMDGPU/lower-buffer-fat-pointers-calls.ll | 2 +-
...ointers-contents-legalization-alignment.ll | 3163 +++++++++++++++++
...ffer-fat-pointers-contents-legalization.ll | 974 ++++-
.../lower-buffer-fat-pointers-mem-transfer.ll | 1356 +++++--
...fer-fat-pointers-unoptimized-debug-data.ll | 2 +-
11 files changed, 9069 insertions(+), 2832 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization-alignment.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index e6250ddf2c26b..a17511d71b997 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -250,6 +250,7 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
@@ -688,6 +689,10 @@ class LegalizeBufferContentTypesVisitor
const DataLayout &DL;
+ // Subtarget info, needed for determining what cache control bits to set.
+ const TargetMachine *TM;
+ const GCNSubtarget *ST = nullptr;
+
/// If T is [N x U], where U is a scalar type, return the vector type
/// <N x U>, otherwise, return T.
Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
@@ -696,10 +701,32 @@ class LegalizeBufferContentTypesVisitor
/// Break up the loads of a struct into the loads of its components
+ /// Return the maximum allowed load/store width for the given type and
+ /// alignment combination based on subtarget flags.
+ /// 1. If unaligned accesses are not enabled, then any load/store that is less
+ /// than word-aligned has to be handled one byte or ushort at a time.
+ /// 2. If relaxed OOB mode is not set, we must ensure that the in-bounds
+ /// part of a partially out of bounds read/write is performed correctly. This
+ /// means that any load that isn't naturally aligned has to be split into
+ /// parts that are naturally aligned, so that, after bitcasting, we don't have
+ /// unaligned loads that could discard valid data.
+ ///
+ /// For example, if we're loading a <8 x i8>, that's actually a load of a <2 x
+ /// i32>, and if we load from an align(2) address, that address might be 2
+ /// bytes from the end of the buffer. The hardware will, when performing the
+ /// <2 x i32> load, mask off the entire first word, causing the two in-bounds
+ /// bytes to be masked off.
+ ///
+ /// Unlike the complete disablement of unaligned accesses from point 1,
+ /// this does not apply to unaligned scalars, but will apply to cases like
+ /// `load <2 x i32>, align 4` since the left element might be out of bounds.
+ uint64_t maxIntrinsicWidth(Type *Ty, Align A);
+
/// Convert a vector or scalar type that can't be operated on by buffer
/// intrinsics to one that would be legal through bitcasts and/or truncation.
- /// Uses the wider of i32, i16, or i8 where possible.
- Type *legalNonAggregateFor(Type *T);
+ /// Uses the wider of i32, i16, or i8 where possible, clamping to the maximum
+ /// allowed width under the alignment rules and subtarget flags.
+ Type *legalNonAggregateForMemOp(Type *T, uint64_t MaxWidth);
Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
@@ -713,8 +740,9 @@ class LegalizeBufferContentTypesVisitor
/// Return the [index, length] pairs into which `T` needs to be cut to form
/// legal buffer load or store operations. Clears `Slices`. Creates an empty
/// `Slices` for non-vector inputs and creates one slice if no slicing will be
- /// needed.
- void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
+ /// needed. No slice may be larger than `MaxWidth`.
+ void getVecSlices(Type *T, uint64_t MaxWidth,
+ SmallVectorImpl<VecSlice> &Slices);
Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
@@ -743,8 +771,9 @@ class LegalizeBufferContentTypesVisitor
bool visitStoreInst(StoreInst &SI);
public:
- LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
- : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {}
+ LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx,
+ const TargetMachine *TM)
+ : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL), TM(TM) {}
bool processFunction(Function &F);
};
} // namespace
@@ -791,7 +820,48 @@ Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
return ArrayRes;
}
-Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
+uint64_t LegalizeBufferContentTypesVisitor::maxIntrinsicWidth(Type *T,
+ Align A) {
+ Align Result(16);
+ if (!ST->hasUnalignedBufferAccessEnabled() && A < Align(4))
+ Result = A;
+ auto *VT = dyn_cast<VectorType>(T);
+ if (!ST->hasRelaxedBufferOOBMode() && VT) {
+ TypeSize ElemBits = DL.getTypeSizeInBits(VT->getElementType());
+ if (ElemBits.isKnownMultipleOf(32)) {
+ // Word-sized operations are bounds-checked per word. So, the only case we
+ // have to worry about is stores that start out of bounds and then go in,
+ // and those can only become in-bounds on a multiple of their alignment.
+ // Therefore, we can use the declared alignment of the operation as the
+ // maximum width, rounding up to 4.
+ Result = std::min(Result, std::max(A, Align(4)));
+ } else if (ElemBits.isKnownMultipleOf(8) ||
+ isPowerOf2_64(ElemBits.getKnownMinValue())) {
+ // To ensure correct behavior for sub-word types, we must always scalarize
+ // unaligned loads of sub-word types. For example, if you load
+ // a <4 x i8> from offset 7 in an 8-byte buffer, expecting the vector
+ // to be padded out with 0s after that last byte, you'll get all 0s
+ // instead. To prevent this behavior when not requested, de-vectorize such
+ // loads.
+ //
+ // This condition could be looser and mirror the word-length condition
+ // if we were allowed to assume that the number of records in a buffer
+ // was a multiple of 4 - then, we could always use the vector's
+ // alignment of the access on the assumption that no one wants their
+ // mask to kick in mid-word.
+ //
+ // Strict OOB checking isn't supported if the size of each element is a
+ // non-power-of-2 value less than 8, since there's no feasible way to
+ // apply such a strict bounds check.
+ Result =
+ commonAlignment(Result, divideCeil(ElemBits.getKnownMinValue(), 8));
+ }
+ }
+ return Result.value() * 8;
+}
+
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateForMemOp(
+ Type *T, uint64_t MaxWidth) {
TypeSize Size = DL.getTypeStoreSizeInBits(T);
// Implicitly zero-extend to the next byte if needed
if (!DL.typeSizeEqualsStoreSize(T))
@@ -803,15 +873,16 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
return T;
}
unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
- if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
+ if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= MaxWidth) {
// [vectors of] anything that's 16/32/64/128 bits can be cast and split into
- // legal buffer operations.
+ // legal buffer operations, except that we might need to cut them into
+ // smaller values if we're not allowed to do unaligned vector loads.
return T;
}
Type *BestVectorElemType = nullptr;
- if (Size.isKnownMultipleOf(32))
+ if (Size.isKnownMultipleOf(32) && MaxWidth >= 32)
BestVectorElemType = IRB.getInt32Ty();
- else if (Size.isKnownMultipleOf(16))
+ else if (Size.isKnownMultipleOf(16) && MaxWidth >= 16)
BestVectorElemType = IRB.getInt16Ty();
else
BestVectorElemType = IRB.getInt8Ty();
@@ -884,7 +955,7 @@ Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
}
void LegalizeBufferContentTypesVisitor::getVecSlices(
- Type *T, SmallVectorImpl<VecSlice> &Slices) {
+ Type *T, uint64_t MaxWidth, SmallVectorImpl<VecSlice> &Slices) {
Slices.clear();
auto *VT = dyn_cast<FixedVectorType>(T);
if (!VT)
@@ -905,8 +976,8 @@ void LegalizeBufferContentTypesVisitor::getVecSlices(
uint64_t TotalElems = VT->getNumElements();
uint64_t Index = 0;
- auto TrySlice = [&](unsigned MaybeLen) {
- if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
+ auto TrySlice = [&](unsigned MaybeLen, unsigned Width) {
+ if (MaybeLen > 0 && Width <= MaxWidth && Index + MaybeLen <= TotalElems) {
VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen};
Slices.push_back(Slice);
Index += MaybeLen;
@@ -915,9 +986,9 @@ void LegalizeBufferContentTypesVisitor::getVecSlices(
return false;
};
while (Index < TotalElems) {
- TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
- TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
- TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
+ TrySlice(ElemsPer4Words, 128) || TrySlice(ElemsPer3Words, 96) ||
+ TrySlice(ElemsPer2Words, 64) || TrySlice(ElemsPerWord, 32) ||
+ TrySlice(ElemsPerShort, 16) || TrySlice(ElemsPerByte, 8);
}
}
@@ -1004,11 +1075,13 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
// Typical case
+ Align PartAlign = commonAlignment(OrigLI.getAlign(), AggByteOff);
Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ uint64_t MaxWidth = maxIntrinsicWidth(ArrayAsVecType, PartAlign);
+ Type *LegalType = legalNonAggregateForMemOp(ArrayAsVecType, MaxWidth);
SmallVector<VecSlice> Slices;
- getVecSlices(LegalType, Slices);
+ getVecSlices(LegalType, MaxWidth, Slices);
bool HasSlices = Slices.size() > 1;
bool IsAggPart = !AggIdxs.empty();
Value *LoadsRes;
@@ -1045,7 +1118,8 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
Value *NewPtr = IRB.CreateGEP(
IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset),
OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset),
- GEPNoWrapFlags::noUnsignedWrap());
+ ST->hasRelaxedBufferOOBMode() ? GEPNoWrapFlags::noUnsignedWrap()
+ : GEPNoWrapFlags::none());
Type *LoadableType = intrinsicTypeFor(SliceType);
LoadInst *NewLI = IRB.CreateAlignedLoad(
LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
@@ -1134,13 +1208,15 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
NewData = arrayToVector(NewData, ArrayAsVecType, Name);
}
- Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ Align PartAlign = commonAlignment(OrigSI.getAlign(), AggByteOff);
+ uint64_t MaxWidth = maxIntrinsicWidth(ArrayAsVecType, PartAlign);
+ Type *LegalType = legalNonAggregateForMemOp(ArrayAsVecType, MaxWidth);
if (LegalType != ArrayAsVecType) {
NewData = makeLegalNonAggregate(NewData, LegalType, Name);
}
SmallVector<VecSlice> Slices;
- getVecSlices(LegalType, Slices);
+ getVecSlices(LegalType, MaxWidth, Slices);
bool NeedToSplit = Slices.size() > 1 || IsAggPart;
if (!NeedToSplit) {
Type *StorableType = intrinsicTypeFor(LegalType);
@@ -1161,10 +1237,11 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
Type *SliceType =
S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
- Value *NewPtr =
- IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
- OrigPtr->getName() + ".part." + Twine(S.Index),
- GEPNoWrapFlags::noUnsignedWrap());
+ Value *NewPtr = IRB.CreateGEP(
+ IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
+ OrigPtr->getName() + ".part." + Twine(S.Index),
+ ST->hasRelaxedBufferOOBMode() ? GEPNoWrapFlags::noUnsignedWrap()
+ : GEPNoWrapFlags::none());
Value *DataSlice = extractSlice(NewData, S, Name);
Type *StorableType = intrinsicTypeFor(SliceType);
DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
@@ -1193,6 +1270,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
}
bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
+ ST = &TM->getSubtarget<GCNSubtarget>(F);
bool Changed = false;
// Note, memory transfer intrinsics won't
for (Instruction &I : make_early_inc_range(instructions(F))) {
@@ -2438,8 +2516,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL,
M.getContext(), &TM);
- LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
- M.getContext());
+ LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(
+ DL, M.getContext(), &TM);
for (Function &F : M.functions()) {
bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
bool BodyChanges = containsBufferFatPointers(F, &StructTM);
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index b66ee994ce7ee..0532b82caf422 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -6254,19 +6254,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
+; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v1, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v0, v0, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6287,12 +6294,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
@@ -6324,13 +6336,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
@@ -6351,13 +6367,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
@@ -6379,17 +6399,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6425,17 +6446,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6503,17 +6525,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v2
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6534,12 +6561,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -6579,13 +6611,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v2, v1, v2
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
@@ -6607,17 +6643,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6653,17 +6690,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6780,24 +6818,43 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_u16 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_u16 v7, v4, s[4:7], 0 offen offset:1026
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v8, v7, v6, 0x5040100
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; Child Loop BB21_6 Depth 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v7, v8, v5
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v7
; GFX11-NEXT: v_mov_b32_e32 v7, v8
-; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -6812,8 +6869,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-NEXT: s_cbranch_execnz .LBB21_6
+; GFX11-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
@@ -6823,8 +6880,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB21_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_cbranch_execnz .LBB21_5
+; GFX11-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6844,23 +6901,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB21_3
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v8, v7, v6, 0x5040100
+; GFX10-NEXT: .LBB21_5: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB21_6 Depth 2
; GFX10-NEXT: v_pk_add_f16 v7, v8, v5
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v6, v7
; GFX10-NEXT: v_mov_b32_e32 v7, v8
-; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX10-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -6874,8 +6948,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX10-NEXT: s_cbranch_execnz .LBB21_6
+; GFX10-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
@@ -6885,8 +6959,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB21_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB21_5
+; GFX10-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -6932,22 +7006,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB21_3
+; GFX908-NEXT: ; %bb.4:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b32 s4, 0x5040100
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_perm_b32 v8, v7, v6, s4
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX908-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ; Child Loop BB21_6 Depth 2
; GFX908-NEXT: v_pk_add_f16 v7, v8, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v7, v8
-; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX908-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
; GFX908-NEXT: v_readfirstlane_b32 s9, v1
@@ -6960,8 +7052,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB21_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX908-NEXT: s_cbranch_execnz .LBB21_6
+; GFX908-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
@@ -6969,8 +7061,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX908-NEXT: v_mov_b32_e32 v8, v6
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB21_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_cbranch_execnz .LBB21_5
+; GFX908-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v6
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -6990,24 +7082,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB21_3
+; GFX8-NEXT: ; %bb.4:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT: v_or_b32_e32 v8, v6, v4
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX8-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; Child Loop BB21_6 Depth 2
; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v7, v8
-; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX8-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
; GFX8-NEXT: v_readfirstlane_b32 s9, v1
@@ -7020,8 +7130,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB21_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX8-NEXT: s_cbranch_execnz .LBB21_6
+; GFX8-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
@@ -7029,8 +7139,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_mov_b32_e32 v8, v6
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB21_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_cbranch_execnz .LBB21_5
+; GFX8-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -7049,24 +7159,39 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB21_3
+; GFX7-NEXT: ; %bb.4:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX7-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX7-NEXT: ; Child Loop BB21_6 Depth 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: s_mov_b64 s[12:13], exec
@@ -7082,7 +7207,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX7-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
; GFX7-NEXT: v_readfirstlane_b32 s9, v1
@@ -7095,8 +7220,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX7-NEXT: s_cbranch_execnz .LBB21_6
+; GFX7-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
@@ -7106,8 +7231,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_cbranch_execnz .LBB21_5
+; GFX7-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
@@ -7127,24 +7252,39 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB21_3
+; GFX6-NEXT: ; %bb.4:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v8
; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX6-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX6-NEXT: ; Child Loop BB21_6 Depth 2
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: s_mov_b64 s[12:13], exec
@@ -7161,7 +7301,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX6-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
; GFX6-NEXT: v_readfirstlane_b32 s9, v1
@@ -7174,8 +7314,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX6-NEXT: s_cbranch_execnz .LBB21_6
+; GFX6-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
@@ -7185,8 +7325,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_cbranch_execnz .LBB21_5
+; GFX6-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v4
; GFX6-NEXT: v_mov_b32_e32 v1, v5
@@ -7225,19 +7365,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
+; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v1, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v0, v0, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -7258,12 +7405,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
@@ -7286,13 +7438,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
@@ -7312,13 +7468,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
@@ -7339,13 +7499,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
@@ -7367,17 +7531,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7413,17 +7578,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7491,17 +7657,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v2
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -7522,12 +7693,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7549,13 +7725,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v3, v3, 16, v1
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
@@ -7574,13 +7754,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v2, v3, 16, v1
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v2
; GFX908-NEXT: v_mov_b32_e32 v4, v1
@@ -7600,13 +7784,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v2, v1, v2
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
@@ -7628,17 +7816,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7674,17 +7863,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7750,19 +7940,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s16
+; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v1, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v0, v0, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -7783,12 +7980,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
@@ -7811,13 +8013,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
@@ -7837,13 +8043,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v0, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
@@ -7864,13 +8074,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v1, v0, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
@@ -7892,17 +8106,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7938,17 +8153,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8016,17 +8232,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v2
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -8047,12 +8268,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8074,13 +8300,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v3, v3, 16, v1
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
@@ -8099,13 +8329,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v2, v3, 16, v1
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v2
; GFX908-NEXT: v_mov_b32_e32 v4, v1
@@ -8125,13 +8359,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v2, v1, v2
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
@@ -8153,17 +8391,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8199,17 +8438,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8269,19 +8509,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -8314,41 +8557,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -8367,17 +8615,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s20
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
@@ -8412,19 +8664,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -8456,19 +8711,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: v_mov_b32_e32 v1, s20
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v0
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -8501,17 +8759,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -8548,16 +8809,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8590,16 +8853,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8653,17 +8918,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], 0
+; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
@@ -8696,24 +8965,31 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -8747,14 +9023,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -8790,17 +9071,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
@@ -8833,17 +9118,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
@@ -8877,15 +9166,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
@@ -8922,15 +9215,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
@@ -8964,15 +9259,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
@@ -9057,21 +9354,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_ushort v6, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v1
+; GFX942-NEXT: v_readfirstlane_b32 s6, v2
+; GFX942-NEXT: v_readfirstlane_b32 s7, v3
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT: buffer_load_ushort v7, v4, s[4:7], 0 offen offset:1026
+; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB28_3
+; GFX942-NEXT: ; %bb.4:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b32 s0, 0x5040100
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_perm_b32 v7, v7, v6, s0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX942-NEXT: s_movk_i32 s10, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX942-NEXT: s_mov_b32 s11, 0x7060302
-; GFX942-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX942-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
-; GFX942-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: ; Child Loop BB28_6 Depth 2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX942-NEXT: v_add_f32_e32 v4, v4, v9
; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -9091,7 +9406,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
-; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX942-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
; GFX942-NEXT: v_readfirstlane_b32 s5, v1
@@ -9105,8 +9420,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB28_4
-; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX942-NEXT: s_cbranch_execnz .LBB28_6
+; GFX942-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -9114,8 +9429,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_cbranch_execnz .LBB28_3
-; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX942-NEXT: s_cbranch_execnz .LBB28_5
+; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -9137,28 +9452,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_u16 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB28_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_u16 v8, v4, s[4:7], 0 offen offset:1026
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB28_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX11-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; Child Loop BB28_6 Depth 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
@@ -9173,7 +9507,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX11-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -9188,8 +9522,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX11-NEXT: s_cbranch_execnz .LBB28_6
+; GFX11-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -9199,8 +9533,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB28_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_cbranch_execnz .LBB28_5
+; GFX11-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v4
@@ -9221,19 +9555,36 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB28_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB28_3
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX10-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; Child Loop BB28_6 Depth 2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
@@ -9253,7 +9604,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX10-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -9267,8 +9618,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB28_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX10-NEXT: s_cbranch_execnz .LBB28_6
+; GFX10-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -9278,8 +9629,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB28_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB28_5
+; GFX10-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9299,21 +9650,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB28_3
+; GFX90A-NEXT: ; %bb.4:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_perm_b32 v7, v7, v6, s4
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ; Child Loop BB28_6 Depth 2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -9331,7 +9700,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX90A-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
@@ -9344,8 +9713,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB28_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX90A-NEXT: s_cbranch_execnz .LBB28_6
+; GFX90A-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -9353,8 +9722,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB28_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB28_5
+; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -9374,21 +9743,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB28_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB28_3
+; GFX908-NEXT: ; %bb.4:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b32 s4, 0x5040100
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_perm_b32 v6, v8, v6, s4
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX908-NEXT: s_mov_b32 s15, 0x7060302
-; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX908-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ; Child Loop BB28_6 Depth 2
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX908-NEXT: v_add_f32_e32 v4, v4, v8
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -9407,7 +9794,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
-; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX908-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
; GFX908-NEXT: v_readfirstlane_b32 s9, v1
@@ -9420,8 +9807,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB28_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX908-NEXT: s_cbranch_execnz .LBB28_6
+; GFX908-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -9429,8 +9816,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB28_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_cbranch_execnz .LBB28_5
+; GFX908-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -9440,7 +9827,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB28_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
; GFX8-NEXT: v_readfirstlane_b32 s9, v1
; GFX8-NEXT: v_readfirstlane_b32 s10, v2
@@ -9450,19 +9853,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
; GFX8-NEXT: ; implicit-def: $vgpr4
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB28_1
-; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_cbranch_execnz .LBB28_3
+; GFX8-NEXT: ; %bb.4:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX8-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; Child Loop BB28_6 Depth 2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -9484,7 +9889,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
-; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX8-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
; GFX8-NEXT: v_readfirstlane_b32 s9, v1
@@ -9497,8 +9902,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB28_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX8-NEXT: s_cbranch_execnz .LBB28_6
+; GFX8-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -9506,8 +9911,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB28_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_cbranch_execnz .LBB28_5
+; GFX8-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -9526,23 +9931,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_ushort v9, v4, s[8:11], 0 offen offset:1026
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB28_3
+; GFX7-NEXT: ; %bb.4:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX7-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX7-NEXT: ; Child Loop BB28_6 Depth 2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
@@ -9556,7 +9976,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX7-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
; GFX7-NEXT: v_readfirstlane_b32 s9, v1
@@ -9569,8 +9989,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB28_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX7-NEXT: s_cbranch_execnz .LBB28_6
+; GFX7-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -9579,8 +9999,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB28_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_cbranch_execnz .LBB28_5
+; GFX7-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v4
@@ -9600,23 +10020,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB28_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB28_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_ushort v9, v4, s[8:11], 0 offen offset:1026
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB28_3
+; GFX6-NEXT: ; %bb.4:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v9
; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX6-NEXT: s_mov_b64 s[6:7], 0
; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX6-NEXT: .LBB28_5: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX6-NEXT: ; Child Loop BB28_6 Depth 2
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7
; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
@@ -9630,7 +10065,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX6-NEXT: .LBB28_6: ; Parent Loop BB28_5 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
; GFX6-NEXT: v_readfirstlane_b32 s9, v1
@@ -9643,8 +10078,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB28_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX6-NEXT: s_cbranch_execnz .LBB28_6
+; GFX6-NEXT: ; %bb.7: ; in Loop: Header=BB28_5 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -9654,8 +10089,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB28_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_cbranch_execnz .LBB28_5
+; GFX6-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v4
@@ -9683,19 +10118,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -9728,41 +10166,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -9781,17 +10224,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s20
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
@@ -9826,19 +10273,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -9870,19 +10320,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: v_mov_b32_e32 v1, s20
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v0
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -9915,17 +10368,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -9962,16 +10418,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10004,16 +10462,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10067,17 +10527,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], 0
+; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
@@ -10110,24 +10574,31 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -10161,14 +10632,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -10204,17 +10680,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
@@ -10247,17 +10727,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
@@ -10291,15 +10775,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
@@ -10336,15 +10824,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
@@ -10378,15 +10868,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
@@ -10440,19 +10932,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -10485,41 +10980,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -10538,17 +11038,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s20
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
@@ -10583,19 +11087,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -10627,19 +11134,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: v_mov_b32_e32 v1, s20
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v0
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -10672,17 +11182,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -10719,16 +11232,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10761,16 +11276,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10824,17 +11341,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], 0
+; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
@@ -10867,24 +11388,31 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -10918,14 +11446,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -10961,17 +11494,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
@@ -11004,17 +11541,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
@@ -11048,15 +11589,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
@@ -11093,15 +11638,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
@@ -11135,15 +11682,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
@@ -11198,17 +11747,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], 0
+; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
@@ -11241,24 +11794,31 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -11292,14 +11852,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -11335,17 +11900,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
@@ -11378,17 +11947,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
@@ -11422,15 +11995,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
@@ -11467,15 +12044,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
@@ -11509,15 +12088,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index cb557c62c206c..5b135b2bc3295 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -5357,23 +5357,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5390,16 +5394,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
+; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v3, s6
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_pk_max_f16 v0, v5, v5
; GFX942-NEXT: buffer_wbl2 sc1
@@ -5420,22 +5427,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v3
; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v4, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -5453,16 +5465,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s20
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v3, s4
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_max_f16 v0, v5, v5
@@ -5484,16 +5500,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
+; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2
@@ -5512,16 +5531,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: v_mov_b32_e32 v1, s20
+; GFX908-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
+; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_pk_max_f16 v0, v5, v5
; GFX908-NEXT: v_pk_max_f16 v4, v0, v2
@@ -5541,17 +5563,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
+; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v1, v6, v6
@@ -5575,17 +5600,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5621,17 +5647,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5679,18 +5706,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5709,14 +5742,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX942-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v4, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v3, s6
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_pk_max_f16 v0, v1, v1
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: v_pk_max_f16 v0, v0, v2
@@ -5739,17 +5776,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v1, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -5770,13 +5814,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_max_f16 v0, v1, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
@@ -5799,14 +5848,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
@@ -5826,14 +5879,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v4, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
; GFX908-NEXT: v_pk_max_f16 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
@@ -5854,15 +5911,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -5886,17 +5947,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5932,17 +5994,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6002,26 +6065,47 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_u16 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v8, v4, s[4:7], null offen offset:1026
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB18_3
+; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX12-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; Child Loop BB18_6 Depth 2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX12-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -6036,8 +6120,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX12-NEXT: s_cbranch_execnz .LBB18_6
+; GFX12-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6046,8 +6130,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB18_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT: s_cbranch_execnz .LBB18_5
+; GFX12-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -6067,24 +6151,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_ushort v6, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v1
+; GFX942-NEXT: v_readfirstlane_b32 s6, v2
+; GFX942-NEXT: v_readfirstlane_b32 s7, v3
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT: buffer_load_ushort v7, v4, s[4:7], 0 offen offset:1026
+; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB18_3
+; GFX942-NEXT: ; %bb.4:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b32 s0, 0x5040100
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_perm_b32 v7, v7, v6, s0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_pk_max_f16 v9, v5, v5
-; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX942-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
-; GFX942-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: ; Child Loop BB18_6 Depth 2
; GFX942-NEXT: v_pk_max_f16 v4, v7, v7
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_pk_max_f16 v6, v4, v9
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
-; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX942-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
; GFX942-NEXT: v_readfirstlane_b32 s5, v1
@@ -6098,8 +6200,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB18_4
-; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX942-NEXT: s_cbranch_execnz .LBB18_6
+; GFX942-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6107,8 +6209,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_cbranch_execnz .LBB18_3
-; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX942-NEXT: s_cbranch_execnz .LBB18_5
+; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -6130,26 +6232,46 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_u16 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_u16 v8, v4, s[4:7], 0 offen offset:1026
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB18_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX11-NEXT: v_pk_max_f16 v8, v5, v5
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX11-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; Child Loop BB18_6 Depth 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v5, v4, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX11-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -6164,8 +6286,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX11-NEXT: s_cbranch_execnz .LBB18_6
+; GFX11-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6175,8 +6297,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB18_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_cbranch_execnz .LBB18_5
+; GFX11-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6196,25 +6318,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB18_3
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX10-NEXT: v_pk_max_f16 v8, v5, v5
-; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX10-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; Child Loop BB18_6 Depth 2
; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_max_f16 v5, v4, v8
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX10-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -6228,8 +6367,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX10-NEXT: s_cbranch_execnz .LBB18_6
+; GFX10-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6239,8 +6378,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB18_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB18_5
+; GFX10-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -6260,23 +6399,41 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
+; GFX90A-NEXT: ; %bb.4:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_perm_b32 v7, v7, v6, s4
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
-; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ; Child Loop BB18_6 Depth 2
; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX90A-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
@@ -6289,8 +6446,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_6
+; GFX90A-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6298,8 +6455,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_5
+; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6319,24 +6476,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB18_3
+; GFX908-NEXT: ; %bb.4:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b32 s4, 0x5040100
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_perm_b32 v6, v8, v6, s4
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_pk_max_f16 v8, v5, v5
-; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX908-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ; Child Loop BB18_6 Depth 2
; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
; GFX908-NEXT: v_pk_max_f16 v5, v4, v8
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
-; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX908-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
; GFX908-NEXT: v_readfirstlane_b32 s9, v1
@@ -6349,8 +6524,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB18_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX908-NEXT: s_cbranch_execnz .LBB18_6
+; GFX908-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -6358,8 +6533,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB18_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_cbranch_execnz .LBB18_5
+; GFX908-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -6379,19 +6554,37 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_3
+; GFX8-NEXT: ; %bb.4:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v9, v5, v5
-; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX8-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; Child Loop BB18_6 Depth 2
; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -6400,7 +6593,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
-; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX8-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
; GFX8-NEXT: v_readfirstlane_b32 s9, v1
@@ -6413,8 +6606,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB18_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX8-NEXT: s_cbranch_execnz .LBB18_6
+; GFX8-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -6422,8 +6615,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB18_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_cbranch_execnz .LBB18_5
+; GFX8-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -6442,24 +6635,39 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_3
+; GFX7-NEXT: ; %bb.4:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX7-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX7-NEXT: ; Child Loop BB18_6 Depth 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: s_mov_b64 s[12:13], exec
@@ -6475,7 +6683,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX7-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
; GFX7-NEXT: v_readfirstlane_b32 s9, v1
@@ -6488,8 +6696,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX7-NEXT: s_cbranch_execnz .LBB18_6
+; GFX7-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
@@ -6499,8 +6707,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_cbranch_execnz .LBB18_5
+; GFX7-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
@@ -6520,24 +6728,39 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB18_3
+; GFX6-NEXT: ; %bb.4:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v8
; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX6-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX6-NEXT: ; Child Loop BB18_6 Depth 2
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: s_mov_b64 s[12:13], exec
@@ -6554,7 +6777,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX6-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
; GFX6-NEXT: v_readfirstlane_b32 s9, v1
@@ -6567,8 +6790,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX6-NEXT: s_cbranch_execnz .LBB18_6
+; GFX6-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
@@ -6578,8 +6801,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_cbranch_execnz .LBB18_5
+; GFX6-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v4
; GFX6-NEXT: v_mov_b32_e32 v1, v5
@@ -6602,41 +6825,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v4
; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v6, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6653,19 +6881,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -6698,41 +6929,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6751,17 +6987,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s20
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
@@ -6796,19 +7036,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -6840,19 +7083,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: v_mov_b32_e32 v1, s20
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v0
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -6885,17 +7131,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -6932,16 +7181,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6974,16 +7225,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7026,22 +7279,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -7073,17 +7333,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], 0
+; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX942-NEXT: v_max_f32_e32 v0, v0, v2
@@ -7116,24 +7380,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -7167,14 +7438,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7210,17 +7486,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
@@ -7253,17 +7533,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
@@ -7297,15 +7581,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
@@ -7342,15 +7630,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -7384,15 +7674,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -7450,27 +7742,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_u16 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v8, v4, s[4:7], null offen offset:1026
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB21_3
+; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX12-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; Child Loop BB21_6 Depth 2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
@@ -7487,7 +7799,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX12-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -7502,8 +7814,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX12-NEXT: s_cbranch_execnz .LBB21_6
+; GFX12-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7512,8 +7824,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB21_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT: s_cbranch_execnz .LBB21_5
+; GFX12-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -7533,21 +7845,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_ushort v6, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v1
+; GFX942-NEXT: v_readfirstlane_b32 s6, v2
+; GFX942-NEXT: v_readfirstlane_b32 s7, v3
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT: buffer_load_ushort v7, v4, s[4:7], 0 offen offset:1026
+; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB21_3
+; GFX942-NEXT: ; %bb.4:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b32 s0, 0x5040100
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_perm_b32 v7, v7, v6, s0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX942-NEXT: s_movk_i32 s10, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX942-NEXT: s_mov_b32 s11, 0x7060302
-; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX942-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
-; GFX942-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: ; Child Loop BB21_6 Depth 2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX942-NEXT: v_max_f32_e32 v4, v4, v9
; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7567,7 +7897,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
-; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX942-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
; GFX942-NEXT: v_readfirstlane_b32 s5, v1
@@ -7581,8 +7911,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB21_4
-; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX942-NEXT: s_cbranch_execnz .LBB21_6
+; GFX942-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7590,8 +7920,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_cbranch_execnz .LBB21_3
-; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX942-NEXT: s_cbranch_execnz .LBB21_5
+; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -7613,28 +7943,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_u16 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_u16 v8, v4, s[4:7], 0 offen offset:1026
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; Child Loop BB21_6 Depth 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
@@ -7649,7 +7998,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -7664,8 +8013,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-NEXT: s_cbranch_execnz .LBB21_6
+; GFX11-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7675,8 +8024,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB21_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_cbranch_execnz .LBB21_5
+; GFX11-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v4
@@ -7697,19 +8046,36 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB21_3
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX10-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; Child Loop BB21_6 Depth 2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
@@ -7729,7 +8095,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX10-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -7743,8 +8109,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX10-NEXT: s_cbranch_execnz .LBB21_6
+; GFX10-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7754,8 +8120,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB21_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB21_5
+; GFX10-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7775,21 +8141,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
+; GFX90A-NEXT: ; %bb.4:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_perm_b32 v7, v7, v6, s4
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ; Child Loop BB21_6 Depth 2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7807,7 +8191,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX90A-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
@@ -7820,8 +8204,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_6
+; GFX90A-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7829,8 +8213,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_5
+; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7850,21 +8234,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB21_3
+; GFX908-NEXT: ; %bb.4:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b32 s4, 0x5040100
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_perm_b32 v6, v8, v6, s4
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX908-NEXT: s_mov_b32 s15, 0x7060302
-; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX908-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ; Child Loop BB21_6 Depth 2
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX908-NEXT: v_max_f32_e32 v4, v4, v8
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7883,7 +8285,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
-; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX908-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
; GFX908-NEXT: v_readfirstlane_b32 s9, v1
@@ -7896,8 +8298,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB21_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX908-NEXT: s_cbranch_execnz .LBB21_6
+; GFX908-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7905,8 +8307,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB21_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_cbranch_execnz .LBB21_5
+; GFX908-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -7926,19 +8328,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB21_3
+; GFX8-NEXT: ; %bb.4:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX8-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; Child Loop BB21_6 Depth 2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX8-NEXT: v_max_f32_e32 v4, v4, v8
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7960,7 +8380,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
-; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX8-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
; GFX8-NEXT: v_readfirstlane_b32 s9, v1
@@ -7973,8 +8393,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB21_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX8-NEXT: s_cbranch_execnz .LBB21_6
+; GFX8-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7982,8 +8402,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB21_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_cbranch_execnz .LBB21_5
+; GFX8-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -8002,23 +8422,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_ushort v9, v4, s[8:11], 0 offen offset:1026
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB21_3
+; GFX7-NEXT: ; %bb.4:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX7-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX7-NEXT: ; Child Loop BB21_6 Depth 2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
@@ -8032,7 +8467,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX7-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
; GFX7-NEXT: v_readfirstlane_b32 s9, v1
@@ -8045,8 +8480,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX7-NEXT: s_cbranch_execnz .LBB21_6
+; GFX7-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -8055,8 +8490,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_cbranch_execnz .LBB21_5
+; GFX7-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v4
@@ -8076,23 +8511,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_ushort v9, v4, s[8:11], 0 offen offset:1026
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB21_3
+; GFX6-NEXT: ; %bb.4:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v9
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX6-NEXT: s_mov_b64 s[6:7], 0
; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX6-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX6-NEXT: ; Child Loop BB21_6 Depth 2
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7
@@ -8106,7 +8556,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX6-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
; GFX6-NEXT: v_readfirstlane_b32 s9, v1
@@ -8119,8 +8569,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX6-NEXT: s_cbranch_execnz .LBB21_6
+; GFX6-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -8130,8 +8580,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_cbranch_execnz .LBB21_5
+; GFX6-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index fea674a100b99..73c11908fa9ad 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -5357,23 +5357,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v5, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5390,16 +5394,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
+; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v3, s6
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_pk_max_f16 v0, v5, v5
; GFX942-NEXT: buffer_wbl2 sc1
@@ -5420,22 +5427,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v3
; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_min_f16 v4, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -5453,16 +5465,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s20
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v3, s4
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_max_f16 v0, v5, v5
@@ -5484,16 +5500,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
+; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2
@@ -5512,16 +5531,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: v_mov_b32_e32 v1, s20
+; GFX908-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
+; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_pk_max_f16 v0, v5, v5
; GFX908-NEXT: v_pk_min_f16 v4, v0, v2
@@ -5541,17 +5563,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
+; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v1, v6, v6
@@ -5575,17 +5600,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5621,17 +5647,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5679,18 +5706,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5709,14 +5742,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX942-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v4, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v3, s6
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_pk_max_f16 v0, v1, v1
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: v_pk_min_f16 v0, v0, v2
@@ -5739,17 +5776,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v1, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -5770,13 +5814,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_max_f16 v0, v1, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
@@ -5799,14 +5848,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
@@ -5826,14 +5879,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v3, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v4, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
; GFX908-NEXT: v_pk_min_f16 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
@@ -5854,15 +5911,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -5886,17 +5947,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5932,17 +5994,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1026
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6002,26 +6065,47 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_u16 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v8, v4, s[4:7], null offen offset:1026
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB18_3
+; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5
; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX12-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; Child Loop BB18_6 Depth 2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX12-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -6036,8 +6120,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX12-NEXT: s_cbranch_execnz .LBB18_6
+; GFX12-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6046,8 +6130,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB18_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT: s_cbranch_execnz .LBB18_5
+; GFX12-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -6067,24 +6151,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_ushort v6, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v1
+; GFX942-NEXT: v_readfirstlane_b32 s6, v2
+; GFX942-NEXT: v_readfirstlane_b32 s7, v3
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT: buffer_load_ushort v7, v4, s[4:7], 0 offen offset:1026
+; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB18_3
+; GFX942-NEXT: ; %bb.4:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b32 s0, 0x5040100
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_perm_b32 v7, v7, v6, s0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_pk_max_f16 v9, v5, v5
-; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX942-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
-; GFX942-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: ; Child Loop BB18_6 Depth 2
; GFX942-NEXT: v_pk_max_f16 v4, v7, v7
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_pk_min_f16 v6, v4, v9
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
-; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX942-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
; GFX942-NEXT: v_readfirstlane_b32 s5, v1
@@ -6098,8 +6200,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB18_4
-; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX942-NEXT: s_cbranch_execnz .LBB18_6
+; GFX942-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6107,8 +6209,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_cbranch_execnz .LBB18_3
-; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX942-NEXT: s_cbranch_execnz .LBB18_5
+; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -6130,26 +6232,46 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_u16 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_u16 v8, v4, s[4:7], 0 offen offset:1026
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB18_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX11-NEXT: v_pk_max_f16 v8, v5, v5
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX11-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; Child Loop BB18_6 Depth 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_min_f16 v5, v4, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX11-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -6164,8 +6286,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX11-NEXT: s_cbranch_execnz .LBB18_6
+; GFX11-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6175,8 +6297,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB18_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_cbranch_execnz .LBB18_5
+; GFX11-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6196,25 +6318,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB18_3
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX10-NEXT: v_pk_max_f16 v8, v5, v5
-; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX10-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; Child Loop BB18_6 Depth 2
; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_min_f16 v5, v4, v8
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX10-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -6228,8 +6367,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB18_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX10-NEXT: s_cbranch_execnz .LBB18_6
+; GFX10-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6239,8 +6378,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB18_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB18_5
+; GFX10-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -6260,23 +6399,41 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
+; GFX90A-NEXT: ; %bb.4:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_perm_b32 v7, v7, v6, s4
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
-; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ; Child Loop BB18_6 Depth 2
; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX90A-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
@@ -6289,8 +6446,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_6
+; GFX90A-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6298,8 +6455,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_5
+; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -6319,24 +6476,42 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB18_3
+; GFX908-NEXT: ; %bb.4:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b32 s4, 0x5040100
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_perm_b32 v6, v8, v6, s4
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_pk_max_f16 v8, v5, v5
-; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX908-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ; Child Loop BB18_6 Depth 2
; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
; GFX908-NEXT: v_pk_min_f16 v5, v4, v8
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
-; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX908-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
; GFX908-NEXT: v_readfirstlane_b32 s9, v1
@@ -6349,8 +6524,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB18_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX908-NEXT: s_cbranch_execnz .LBB18_6
+; GFX908-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -6358,8 +6533,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB18_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_cbranch_execnz .LBB18_5
+; GFX908-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -6379,19 +6554,37 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_3
+; GFX8-NEXT: ; %bb.4:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v9, v5, v5
-; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX8-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; Child Loop BB18_6 Depth 2
; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -6400,7 +6593,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
-; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX8-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
; GFX8-NEXT: v_readfirstlane_b32 s9, v1
@@ -6413,8 +6606,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB18_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX8-NEXT: s_cbranch_execnz .LBB18_6
+; GFX8-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -6422,8 +6615,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB18_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_cbranch_execnz .LBB18_5
+; GFX8-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -6442,24 +6635,39 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_3
+; GFX7-NEXT: ; %bb.4:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX7-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX7-NEXT: ; Child Loop BB18_6 Depth 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: s_mov_b64 s[12:13], exec
@@ -6475,7 +6683,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
; GFX7-NEXT: v_mov_b32_e32 v8, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX7-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
; GFX7-NEXT: v_readfirstlane_b32 s9, v1
@@ -6488,8 +6696,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX7-NEXT: s_cbranch_execnz .LBB18_6
+; GFX7-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
@@ -6499,8 +6707,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_cbranch_execnz .LBB18_5
+; GFX7-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
@@ -6520,24 +6728,39 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB18_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB18_3
+; GFX6-NEXT: ; %bb.4:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f16_f32_e32 v11, v5
+; GFX6-NEXT: s_waitcnt vmcnt(1)
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v8
; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v11
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX6-NEXT: .LBB18_5: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX6-NEXT: ; Child Loop BB18_6 Depth 2
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: s_mov_b64 s[12:13], exec
@@ -6554,7 +6777,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
; GFX6-NEXT: v_mov_b32_e32 v8, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX6-NEXT: .LBB18_6: ; Parent Loop BB18_5 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
; GFX6-NEXT: v_readfirstlane_b32 s9, v1
@@ -6567,8 +6790,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX6-NEXT: s_cbranch_execnz .LBB18_6
+; GFX6-NEXT: ; %bb.7: ; in Loop: Header=BB18_5 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
@@ -6578,8 +6801,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_cbranch_execnz .LBB18_5
+; GFX6-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v4
; GFX6-NEXT: v_mov_b32_e32 v1, v5
@@ -6602,41 +6825,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v4
; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v6, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6653,19 +6881,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -6698,41 +6929,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6751,17 +6987,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s20
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
@@ -6796,19 +7036,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
@@ -6840,19 +7083,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s20
-; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: v_mov_b32_e32 v1, s20
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v0
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -6885,17 +7131,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
@@ -6932,16 +7181,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6974,16 +7225,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v4, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v5, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7026,22 +7279,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_load_u16 v2, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: buffer_load_u16 v1, v1, s[0:3], null offen offset:1026
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -7073,17 +7333,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:1026
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
-; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: s_mov_b64 s[6:7], 0
+; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX942-NEXT: v_min_f32_e32 v0, v0, v2
@@ -7116,24 +7380,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_u16 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_u16 v1, v1, s[0:3], 0 offen offset:1026
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -7167,14 +7438,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_ushort v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -7210,17 +7486,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
@@ -7253,17 +7533,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_ushort v5, v1, s[16:19], 0 offen offset:1026
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: s_waitcnt vmcnt(1)
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshl_or_b32 v1, v5, 16, v0
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
@@ -7297,15 +7581,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_ushort v4, v1, s[16:19], 0 offen offset:1026
+; GFX8-NEXT: buffer_load_ushort v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
@@ -7342,15 +7630,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX7-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -7384,15 +7674,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_ushort v3, v2, s[16:19], 0 offen offset:1026
+; GFX6-NEXT: buffer_load_ushort v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -7450,27 +7742,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_u16 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v8, v4, s[4:7], null offen offset:1026
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB21_3
+; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX12-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; Child Loop BB21_6 Depth 2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
@@ -7487,7 +7799,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX12-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -7502,8 +7814,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX12-NEXT: s_cbranch_execnz .LBB21_6
+; GFX12-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7512,8 +7824,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB21_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT: s_cbranch_execnz .LBB21_5
+; GFX12-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -7533,21 +7845,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_ushort v6, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v1
+; GFX942-NEXT: v_readfirstlane_b32 s6, v2
+; GFX942-NEXT: v_readfirstlane_b32 s7, v3
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT: buffer_load_ushort v7, v4, s[4:7], 0 offen offset:1026
+; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB21_3
+; GFX942-NEXT: ; %bb.4:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_mov_b32 s0, 0x5040100
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_perm_b32 v7, v7, v6, s0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX942-NEXT: s_movk_i32 s10, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX942-NEXT: s_mov_b32 s11, 0x7060302
-; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX942-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
-; GFX942-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: ; Child Loop BB21_6 Depth 2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX942-NEXT: v_min_f32_e32 v4, v4, v9
; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7567,7 +7897,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
-; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX942-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
; GFX942-NEXT: v_readfirstlane_b32 s5, v1
@@ -7581,8 +7911,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB21_4
-; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX942-NEXT: s_cbranch_execnz .LBB21_6
+; GFX942-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7590,8 +7920,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_cbranch_execnz .LBB21_3
-; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX942-NEXT: s_cbranch_execnz .LBB21_5
+; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -7613,28 +7943,47 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_u16 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-NEXT: ; %bb.2:
; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_u16 v8, v4, s[4:7], 0 offen offset:1026
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; Child Loop BB21_6 Depth 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
@@ -7649,7 +7998,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX11-NEXT: v_mov_b32_e32 v4, v5
; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s5, v1
@@ -7664,8 +8013,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-NEXT: s_cbranch_execnz .LBB21_6
+; GFX11-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7675,8 +8024,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB21_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_cbranch_execnz .LBB21_5
+; GFX11-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v4
@@ -7697,19 +8046,36 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
; GFX10-NEXT: ; %bb.2:
; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB21_3
+; GFX10-NEXT: ; %bb.4:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX10-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; Child Loop BB21_6 Depth 2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
@@ -7729,7 +8095,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX10-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -7743,8 +8109,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX10-NEXT: s_cbranch_execnz .LBB21_6
+; GFX10-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7754,8 +8120,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB21_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_cbranch_execnz .LBB21_5
+; GFX10-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7775,21 +8141,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1026
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
+; GFX90A-NEXT: ; %bb.4:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_perm_b32 v7, v7, v6, s4
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ; Child Loop BB21_6 Depth 2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7807,7 +8191,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX90A-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
@@ -7820,8 +8204,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_6
+; GFX90A-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7829,8 +8213,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_5
+; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7850,21 +8234,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2:
; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB21_3
+; GFX908-NEXT: ; %bb.4:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b32 s4, 0x5040100
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_perm_b32 v6, v8, v6, s4
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX908-NEXT: s_movk_i32 s14, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GFX908-NEXT: s_mov_b32 s15, 0x7060302
-; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX908-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ; Child Loop BB21_6 Depth 2
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX908-NEXT: v_min_f32_e32 v4, v4, v8
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7883,7 +8285,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_mov_b32_e32 v5, v6
-; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX908-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
; GFX908-NEXT: v_readfirstlane_b32 s9, v1
@@ -7896,8 +8298,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB21_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX908-NEXT: s_cbranch_execnz .LBB21_6
+; GFX908-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7905,8 +8307,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB21_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_cbranch_execnz .LBB21_5
+; GFX908-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
@@ -7926,19 +8328,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_ushort v6, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_ushort v8, v4, s[8:11], 0 offen offset:1026
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB21_3
+; GFX8-NEXT: ; %bb.4:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX8-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; Child Loop BB21_6 Depth 2
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX8-NEXT: v_min_f32_e32 v4, v4, v8
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7960,7 +8380,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
-; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX8-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
; GFX8-NEXT: v_readfirstlane_b32 s9, v1
@@ -7973,8 +8393,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB21_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX8-NEXT: s_cbranch_execnz .LBB21_6
+; GFX8-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7982,8 +8402,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB21_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_cbranch_execnz .LBB21_5
+; GFX8-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -8002,23 +8422,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_ushort v9, v4, s[8:11], 0 offen offset:1026
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB21_3
+; GFX7-NEXT: ; %bb.4:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX7-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX7-NEXT: ; Child Loop BB21_6 Depth 2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
@@ -8032,7 +8467,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX7-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
; GFX7-NEXT: v_readfirstlane_b32 s9, v1
@@ -8045,8 +8480,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX7-NEXT: s_cbranch_execnz .LBB21_6
+; GFX7-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -8055,8 +8490,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB21_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_cbranch_execnz .LBB21_5
+; GFX7-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v4
@@ -8076,23 +8511,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: buffer_load_ushort v7, v4, s[8:11], 0 offen offset:1024
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB21_3: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_ushort v9, v4, s[8:11], 0 offen offset:1026
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB21_3
+; GFX6-NEXT: ; %bb.4:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v9
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX6-NEXT: s_mov_b64 s[6:7], 0
; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX6-NEXT: .LBB21_5: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX6-NEXT: ; Child Loop BB21_6 Depth 2
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7
@@ -8106,7 +8556,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX6-NEXT: .LBB21_6: ; Parent Loop BB21_5 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
; GFX6-NEXT: v_readfirstlane_b32 s9, v1
@@ -8119,8 +8569,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX6-NEXT: s_cbranch_execnz .LBB21_6
+; GFX6-NEXT: ; %bb.7: ; in Loop: Header=BB21_5 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -8130,8 +8580,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB21_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_cbranch_execnz .LBB21_5
+; GFX6-NEXT: ; %bb.8: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 405058b24dcc2..1582d460f5285 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -354,15 +354,23 @@ define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v2i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v2i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <2 x i16>, ptr addrspace(7) %p
@@ -373,14 +381,18 @@ define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v2i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v2i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -392,15 +404,35 @@ define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v4i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:6
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v4i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:6
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <4 x i16>, ptr addrspace(7) %p
@@ -411,14 +443,24 @@ define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v4i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v4i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -430,15 +472,59 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v8i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_load_ushort v7, off, s[16:19], 0 offset:14
+; SDAG-NEXT: s_waitcnt vmcnt(7)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(6)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(5)
+; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT: s_waitcnt vmcnt(4)
+; SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_lshl_or_b32 v0, v4, 16, v0
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshl_or_b32 v1, v5, 16, v1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v3, v7, 16, v3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v8i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_load_ushort v7, off, s[16:19], 0 offset:14
+; GISEL-NEXT: s_waitcnt vmcnt(7)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(6)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(5)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: s_waitcnt vmcnt(4)
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_lshl_or_b32 v0, v4, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshl_or_b32 v1, v5, 16, v1
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v3
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <8 x i16>, ptr addrspace(7) %p
@@ -449,14 +535,36 @@ define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v8i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:14
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v8i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:14
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -582,15 +690,23 @@ define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v2f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v2f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <2 x half>, ptr addrspace(7) %p
@@ -601,14 +717,18 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v2f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v2f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -620,15 +740,35 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v4bf16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:6
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v4bf16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:6
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <4 x bfloat>, ptr addrspace(7) %p
@@ -639,14 +779,24 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v4bf16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v4bf16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -658,15 +808,59 @@ define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v8f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_load_ushort v7, off, s[16:19], 0 offset:14
+; SDAG-NEXT: s_waitcnt vmcnt(7)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(6)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(5)
+; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT: s_waitcnt vmcnt(4)
+; SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_lshl_or_b32 v0, v4, 16, v0
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshl_or_b32 v1, v5, 16, v1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v3, v7, 16, v3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v8f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_load_ushort v7, off, s[16:19], 0 offset:14
+; GISEL-NEXT: s_waitcnt vmcnt(7)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(6)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(5)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: s_waitcnt vmcnt(4)
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_lshl_or_b32 v0, v4, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshl_or_b32 v1, v5, 16, v1
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v3
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <8 x half>, ptr addrspace(7) %p
@@ -677,14 +871,36 @@ define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v8f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:14
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v8f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:14
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1306,15 +1522,47 @@ define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v6f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:10
+; SDAG-NEXT: s_waitcnt vmcnt(5)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(4)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v2, v5, 16, v2
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v6f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:10
+; GISEL-NEXT: s_waitcnt vmcnt(5)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(4)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <6 x half>, ptr addrspace(7) %p
@@ -1325,14 +1573,30 @@ define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v6f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v6f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1684,16 +1948,26 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v3i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:2
; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v3i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:2
; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1705,7 +1979,9 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v3i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1713,7 +1989,9 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v3i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:2
; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1726,20 +2004,38 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v5i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:6
; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: s_waitcnt vmcnt(4)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v5i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:6
; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
-; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(4)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1751,7 +2047,12 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v5i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1759,7 +2060,12 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v5i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:6
; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1772,15 +2078,47 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v6i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:10
+; SDAG-NEXT: s_waitcnt vmcnt(5)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(4)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshl_or_b32 v2, v5, 16, v2
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v6i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:10
+; GISEL-NEXT: s_waitcnt vmcnt(5)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(4)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <6 x i16>, ptr addrspace(7) %p
@@ -1791,14 +2129,30 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v6i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v6i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1810,21 +2164,50 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v7i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:10
; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: s_waitcnt vmcnt(6)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(5)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(4)
+; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_lshl_or_b32 v0, v4, 16, v0
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshl_or_b32 v1, v5, 16, v1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v7i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:10
; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
-; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(6)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(5)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(4)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_lshl_or_b32 v0, v4, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshl_or_b32 v1, v5, 16, v1
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
-; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1836,7 +2219,15 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v7i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
; SDAG-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1844,7 +2235,15 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v7i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_short v5, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_store_short v6, off, s[16:19], 0 offset:10
; GISEL-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1857,22 +2256,62 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v9i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ushort v7, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_load_ushort v8, off, s[16:19], 0 offset:14
; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(8)
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: s_waitcnt vmcnt(7)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: s_waitcnt vmcnt(6)
+; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT: s_waitcnt vmcnt(5)
+; SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SDAG-NEXT: s_waitcnt vmcnt(4)
+; SDAG-NEXT: v_lshl_or_b32 v0, v5, 16, v0
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: v_lshl_or_b32 v1, v6, 16, v1
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshl_or_b32 v2, v7, 16, v2
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshl_or_b32 v3, v8, 16, v3
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v9i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_load_ushort v5, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ushort v7, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_load_ushort v8, off, s[16:19], 0 offset:14
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:16
-; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(8)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_waitcnt vmcnt(7)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: s_waitcnt vmcnt(6)
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: s_waitcnt vmcnt(5)
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GISEL-NEXT: s_waitcnt vmcnt(4)
+; GISEL-NEXT: v_lshl_or_b32 v0, v5, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: v_lshl_or_b32 v1, v6, 16, v1
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshl_or_b32 v2, v7, 16, v2
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
-; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
-; GISEL-NEXT: v_bfi_b32 v3, s4, v3, v3
+; GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v3
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1884,7 +2323,18 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v9i16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:14
; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1892,7 +2342,18 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v9i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v5, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_short v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_store_short v7, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_store_short v8, off, s[16:19], 0 offset:14
; GISEL-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1947,17 +2408,17 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v2i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v2i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <2 x i8>, ptr addrspace(7) %p
@@ -1968,19 +2429,16 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v2i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v2i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1992,20 +2450,18 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v3i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v3i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2017,9 +2473,8 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v3i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2027,10 +2482,8 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v3i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2043,21 +2496,21 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v4i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v4i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <4 x i8>, ptr addrspace(7) %p
@@ -2068,28 +2521,20 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v4i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v4i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v5, 8
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xff
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_or_b32 v0, v0, v4, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2101,24 +2546,22 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v5i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v5i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2130,12 +2573,10 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v5i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2143,16 +2584,10 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v5i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v6, 8
-; GISEL-NEXT: v_mov_b32_e32 v5, 0xff
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_or_b32 v0, v0, v5, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2165,30 +2600,25 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v6i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
-; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mov_b32_e32 v1, v7
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v6i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <6 x i8>, ptr addrspace(7) %p
@@ -2199,34 +2629,24 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v6i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
-; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v6i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
-; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2238,30 +2658,26 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v7i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
-; SDAG-NEXT: s_waitcnt vmcnt(2)
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v7i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
-; GISEL-NEXT: s_waitcnt vmcnt(2)
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2273,15 +2689,12 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v7i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
-; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v5
-; SDAG-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2289,20 +2702,12 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: store_v7i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v8, 8
-; GISEL-NEXT: v_mov_b32_e32 v7, 0xff
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
-; GISEL-NEXT: v_and_b32_e32 v0, 0xff, v5
-; GISEL-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2315,31 +2720,29 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v8i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 8, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mov_b32_e32 v1, v8
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v8i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v8, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_mov_b32_e32 v4, v1
-; GISEL-NEXT: v_mov_b32_e32 v1, v8
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <8 x i8>, ptr addrspace(7) %p
@@ -2350,40 +2753,28 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v8i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dwordx2 v[3:4], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v8i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v9, 8
-; GISEL-NEXT: v_mov_b32_e32 v8, 0xff
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_or_b32 v0, v0, v8, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
-; GISEL-NEXT: v_and_or_b32 v1, v4, v8, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
-; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2395,41 +2786,37 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v12i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:9
+; SDAG-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:11
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0
-; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
-; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mov_b32_e32 v1, v14
-; SDAG-NEXT: v_mov_b32_e32 v2, v13
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v12i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:9
+; GISEL-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:11
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GISEL-NEXT: v_mov_b32_e32 v4, v1
-; GISEL-NEXT: v_mov_b32_e32 v8, v2
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v12
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <12 x i8>, ptr addrspace(7) %p
@@ -2440,52 +2827,36 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v12i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
-; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dwordx3 v[6:8], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9
+; SDAG-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v12i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v13, 8
-; GISEL-NEXT: v_mov_b32_e32 v12, 0xff
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_or_b32 v0, v0, v12, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
-; GISEL-NEXT: v_and_or_b32 v1, v4, v12, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
-; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GISEL-NEXT: v_and_or_b32 v2, v8, v12, v2
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9
+; GISEL-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2497,51 +2868,45 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v16i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:9
+; SDAG-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:11
+; SDAG-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:12
+; SDAG-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:13
+; SDAG-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:14
+; SDAG-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:15
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b64 v[18:19], 24, v[0:1]
-; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3]
-; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v3
-; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3
-; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v12, v3
-; SDAG-NEXT: v_mov_b32_e32 v1, v17
-; SDAG-NEXT: v_mov_b32_e32 v2, v16
-; SDAG-NEXT: v_mov_b32_e32 v3, v18
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v16i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:9
+; GISEL-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:11
+; GISEL-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:13
+; GISEL-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:14
+; GISEL-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:15
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v16, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v18, 24, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3
-; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3
-; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; GISEL-NEXT: v_mov_b32_e32 v4, v1
-; GISEL-NEXT: v_mov_b32_e32 v8, v2
-; GISEL-NEXT: v_mov_b32_e32 v12, v3
-; GISEL-NEXT: v_mov_b32_e32 v1, v16
-; GISEL-NEXT: v_mov_b32_e32 v2, v17
-; GISEL-NEXT: v_mov_b32_e32 v3, v18
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <16 x i8>, ptr addrspace(7) %p
@@ -2552,64 +2917,44 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v16i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15
-; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
-; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dwordx4 v[9:12], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9
+; SDAG-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11
+; SDAG-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12
+; SDAG-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13
+; SDAG-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14
+; SDAG-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v16i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v17, 8
-; GISEL-NEXT: v_mov_b32_e32 v16, 0xff
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_or_b32 v0, v0, v16, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
-; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
-; GISEL-NEXT: v_and_or_b32 v1, v4, v16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
-; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GISEL-NEXT: v_and_or_b32 v2, v8, v16, v2
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v14
-; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v15
-; GISEL-NEXT: v_and_or_b32 v3, v12, v16, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GISEL-NEXT: v_or3_b32 v3, v3, v4, v5
-; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9
+; GISEL-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11
+; GISEL-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13
+; GISEL-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14
+; GISEL-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2621,87 +2966,77 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v32i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx4 v[36:39], off, s[16:19], 0
-; SDAG-NEXT: buffer_load_dwordx4 v[32:35], off, s[16:19], 0 offset:16
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[36:37]
-; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[38:39]
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33]
-; SDAG-NEXT: v_lshrrev_b64 v[27:28], 24, v[34:35]
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v36
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v36
-; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v37
-; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v37
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v37
-; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v38
-; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v38
-; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v39
-; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v39
-; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v39
-; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v32
-; SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v32
-; SDAG-NEXT: v_lshrrev_b32_e32 v21, 8, v33
-; SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v33
-; SDAG-NEXT: v_lshrrev_b32_e32 v23, 24, v33
-; SDAG-NEXT: v_lshrrev_b32_e32 v25, 8, v34
-; SDAG-NEXT: v_lshrrev_b32_e32 v26, 16, v34
-; SDAG-NEXT: v_lshrrev_b32_e32 v29, 8, v35
-; SDAG-NEXT: v_lshrrev_b32_e32 v30, 16, v35
-; SDAG-NEXT: v_lshrrev_b32_e32 v31, 24, v35
-; SDAG-NEXT: v_mov_b32_e32 v0, v36
-; SDAG-NEXT: v_mov_b32_e32 v4, v37
-; SDAG-NEXT: v_mov_b32_e32 v8, v38
-; SDAG-NEXT: v_mov_b32_e32 v12, v39
-; SDAG-NEXT: v_mov_b32_e32 v16, v32
-; SDAG-NEXT: v_mov_b32_e32 v20, v33
-; SDAG-NEXT: v_mov_b32_e32 v24, v34
-; SDAG-NEXT: v_mov_b32_e32 v28, v35
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:9
+; SDAG-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:11
+; SDAG-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:12
+; SDAG-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:13
+; SDAG-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:14
+; SDAG-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:15
+; SDAG-NEXT: buffer_load_ubyte v16, off, s[16:19], 0 offset:16
+; SDAG-NEXT: buffer_load_ubyte v17, off, s[16:19], 0 offset:17
+; SDAG-NEXT: buffer_load_ubyte v18, off, s[16:19], 0 offset:18
+; SDAG-NEXT: buffer_load_ubyte v19, off, s[16:19], 0 offset:19
+; SDAG-NEXT: buffer_load_ubyte v20, off, s[16:19], 0 offset:20
+; SDAG-NEXT: buffer_load_ubyte v21, off, s[16:19], 0 offset:21
+; SDAG-NEXT: buffer_load_ubyte v22, off, s[16:19], 0 offset:22
+; SDAG-NEXT: buffer_load_ubyte v23, off, s[16:19], 0 offset:23
+; SDAG-NEXT: buffer_load_ubyte v24, off, s[16:19], 0 offset:24
+; SDAG-NEXT: buffer_load_ubyte v25, off, s[16:19], 0 offset:25
+; SDAG-NEXT: buffer_load_ubyte v26, off, s[16:19], 0 offset:26
+; SDAG-NEXT: buffer_load_ubyte v27, off, s[16:19], 0 offset:27
+; SDAG-NEXT: buffer_load_ubyte v28, off, s[16:19], 0 offset:28
+; SDAG-NEXT: buffer_load_ubyte v29, off, s[16:19], 0 offset:29
+; SDAG-NEXT: buffer_load_ubyte v30, off, s[16:19], 0 offset:30
+; SDAG-NEXT: buffer_load_ubyte v31, off, s[16:19], 0 offset:31
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v32i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
-; GISEL-NEXT: buffer_load_dwordx4 v[16:19], off, s[16:19], 0 offset:16
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v35, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v36, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v37, 24, v0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v32, 8, v16
-; GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GISEL-NEXT: v_lshrrev_b32_e32 v34, 24, v16
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3
-; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3
-; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; GISEL-NEXT: v_lshrrev_b32_e32 v21, 8, v17
-; GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GISEL-NEXT: v_lshrrev_b32_e32 v23, 24, v17
-; GISEL-NEXT: v_lshrrev_b32_e32 v25, 8, v18
-; GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GISEL-NEXT: v_lshrrev_b32_e32 v27, 24, v18
-; GISEL-NEXT: v_lshrrev_b32_e32 v29, 8, v19
-; GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v19
-; GISEL-NEXT: v_lshrrev_b32_e32 v31, 24, v19
-; GISEL-NEXT: v_mov_b32_e32 v4, v1
-; GISEL-NEXT: v_mov_b32_e32 v8, v2
-; GISEL-NEXT: v_mov_b32_e32 v12, v3
-; GISEL-NEXT: v_mov_b32_e32 v20, v17
-; GISEL-NEXT: v_mov_b32_e32 v24, v18
-; GISEL-NEXT: v_mov_b32_e32 v28, v19
-; GISEL-NEXT: v_mov_b32_e32 v1, v35
-; GISEL-NEXT: v_mov_b32_e32 v2, v36
-; GISEL-NEXT: v_mov_b32_e32 v3, v37
-; GISEL-NEXT: v_mov_b32_e32 v17, v32
-; GISEL-NEXT: v_mov_b32_e32 v18, v33
-; GISEL-NEXT: v_mov_b32_e32 v19, v34
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:9
+; GISEL-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:11
+; GISEL-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:13
+; GISEL-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:14
+; GISEL-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:15
+; GISEL-NEXT: buffer_load_ubyte v16, off, s[16:19], 0 offset:16
+; GISEL-NEXT: buffer_load_ubyte v17, off, s[16:19], 0 offset:17
+; GISEL-NEXT: buffer_load_ubyte v18, off, s[16:19], 0 offset:18
+; GISEL-NEXT: buffer_load_ubyte v19, off, s[16:19], 0 offset:19
+; GISEL-NEXT: buffer_load_ubyte v20, off, s[16:19], 0 offset:20
+; GISEL-NEXT: buffer_load_ubyte v21, off, s[16:19], 0 offset:21
+; GISEL-NEXT: buffer_load_ubyte v22, off, s[16:19], 0 offset:22
+; GISEL-NEXT: buffer_load_ubyte v23, off, s[16:19], 0 offset:23
+; GISEL-NEXT: buffer_load_ubyte v24, off, s[16:19], 0 offset:24
+; GISEL-NEXT: buffer_load_ubyte v25, off, s[16:19], 0 offset:25
+; GISEL-NEXT: buffer_load_ubyte v26, off, s[16:19], 0 offset:26
+; GISEL-NEXT: buffer_load_ubyte v27, off, s[16:19], 0 offset:27
+; GISEL-NEXT: buffer_load_ubyte v28, off, s[16:19], 0 offset:28
+; GISEL-NEXT: buffer_load_ubyte v29, off, s[16:19], 0 offset:29
+; GISEL-NEXT: buffer_load_ubyte v30, off, s[16:19], 0 offset:30
+; GISEL-NEXT: buffer_load_ubyte v31, off, s[16:19], 0 offset:31
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <32 x i8>, ptr addrspace(7) %p
@@ -2712,117 +3047,82 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_v32i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
-; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
-; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_load_ubyte v10, off, s[0:3], s32
-; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; SDAG-NEXT: v_lshlrev_b16_e32 v7, 8, v7
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v11, 8, v29
-; SDAG-NEXT: v_lshlrev_b16_e32 v14, 8, v25
-; SDAG-NEXT: v_lshlrev_b16_e32 v15, 8, v27
-; SDAG-NEXT: v_lshlrev_b16_e32 v21, 8, v21
-; SDAG-NEXT: v_lshlrev_b16_e32 v23, 8, v23
-; SDAG-NEXT: v_lshlrev_b16_e32 v17, 8, v17
-; SDAG-NEXT: v_lshlrev_b16_e32 v19, 8, v19
-; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
-; SDAG-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v11, v24, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v14, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v5, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v4, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v3, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v10
-; SDAG-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dwordx4 v[3:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], s32
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; SDAG-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8
+; SDAG-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9
+; SDAG-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10
+; SDAG-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11
+; SDAG-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12
+; SDAG-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13
+; SDAG-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14
+; SDAG-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15
+; SDAG-NEXT: buffer_store_byte v16, off, s[16:19], 0 offset:16
+; SDAG-NEXT: buffer_store_byte v17, off, s[16:19], 0 offset:17
+; SDAG-NEXT: buffer_store_byte v18, off, s[16:19], 0 offset:18
+; SDAG-NEXT: buffer_store_byte v19, off, s[16:19], 0 offset:19
+; SDAG-NEXT: buffer_store_byte v20, off, s[16:19], 0 offset:20
+; SDAG-NEXT: buffer_store_byte v21, off, s[16:19], 0 offset:21
+; SDAG-NEXT: buffer_store_byte v22, off, s[16:19], 0 offset:22
+; SDAG-NEXT: buffer_store_byte v23, off, s[16:19], 0 offset:23
+; SDAG-NEXT: buffer_store_byte v24, off, s[16:19], 0 offset:24
+; SDAG-NEXT: buffer_store_byte v25, off, s[16:19], 0 offset:25
+; SDAG-NEXT: buffer_store_byte v26, off, s[16:19], 0 offset:26
+; SDAG-NEXT: buffer_store_byte v27, off, s[16:19], 0 offset:27
+; SDAG-NEXT: buffer_store_byte v28, off, s[16:19], 0 offset:28
+; SDAG-NEXT: buffer_store_byte v29, off, s[16:19], 0 offset:29
+; SDAG-NEXT: buffer_store_byte v30, off, s[16:19], 0 offset:30
+; SDAG-NEXT: s_waitcnt vmcnt(27)
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:31
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_v32i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v31, 8
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_mov_b32_e32 v32, 0xff
-; GISEL-NEXT: v_and_or_b32 v0, v0, v32, v1
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GISEL-NEXT: buffer_load_ubyte v7, off, s[0:3], s32
-; GISEL-NEXT: v_and_or_b32 v1, v4, v32, v1
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3
-; GISEL-NEXT: v_or3_b32 v1, v1, v4, v5
-; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
-; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
-; GISEL-NEXT: v_and_or_b32 v2, v8, v32, v2
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v14
-; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v15
-; GISEL-NEXT: v_and_or_b32 v3, v12, v32, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GISEL-NEXT: v_or3_b32 v3, v3, v4, v5
-; GISEL-NEXT: v_lshlrev_b32_sdwa v4, v31, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v18
-; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GISEL-NEXT: v_and_or_b32 v4, v16, v32, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_or3_b32 v4, v4, v5, v6
-; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v22
-; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v23
-; GISEL-NEXT: v_and_or_b32 v8, v20, v32, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; GISEL-NEXT: v_or3_b32 v5, v8, v5, v6
-; GISEL-NEXT: v_lshlrev_b32_sdwa v6, v31, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v26
-; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v27
-; GISEL-NEXT: v_and_or_b32 v6, v24, v32, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; GISEL-NEXT: v_or3_b32 v6, v6, v8, v9
-; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v30
-; GISEL-NEXT: v_and_or_b32 v8, v28, v32, v8
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GISEL-NEXT: v_or3_b32 v7, v8, v9, v7
-; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
-; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
+; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
+; GISEL-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8
+; GISEL-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9
+; GISEL-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10
+; GISEL-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11
+; GISEL-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12
+; GISEL-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13
+; GISEL-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14
+; GISEL-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15
+; GISEL-NEXT: buffer_store_byte v16, off, s[16:19], 0 offset:16
+; GISEL-NEXT: buffer_store_byte v17, off, s[16:19], 0 offset:17
+; GISEL-NEXT: buffer_store_byte v18, off, s[16:19], 0 offset:18
+; GISEL-NEXT: buffer_store_byte v19, off, s[16:19], 0 offset:19
+; GISEL-NEXT: buffer_store_byte v20, off, s[16:19], 0 offset:20
+; GISEL-NEXT: buffer_store_byte v21, off, s[16:19], 0 offset:21
+; GISEL-NEXT: buffer_store_byte v22, off, s[16:19], 0 offset:22
+; GISEL-NEXT: buffer_store_byte v23, off, s[16:19], 0 offset:23
+; GISEL-NEXT: buffer_store_byte v24, off, s[16:19], 0 offset:24
+; GISEL-NEXT: buffer_store_byte v25, off, s[16:19], 0 offset:25
+; GISEL-NEXT: buffer_store_byte v26, off, s[16:19], 0 offset:26
+; GISEL-NEXT: buffer_store_byte v27, off, s[16:19], 0 offset:27
+; GISEL-NEXT: buffer_store_byte v28, off, s[16:19], 0 offset:28
+; GISEL-NEXT: buffer_store_byte v29, off, s[16:19], 0 offset:29
+; GISEL-NEXT: buffer_store_byte v30, off, s[16:19], 0 offset:30
+; GISEL-NEXT: s_waitcnt vmcnt(27)
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:31
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2912,39 +3212,58 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_a2f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_a2f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load [2 x half], ptr addrspace(7) %p
ret [2 x half] %ret
}
+define [2 x half] @load_a2f16_align4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2f16_align4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2f16_align4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x half], ptr addrspace(7) %p, align 4
+ ret [2 x half] %ret
+}
+
define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_a2f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s4, 0x5040100
-; SDAG-NEXT: v_perm_b32 v0, v1, v0, s4
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: store_a2f16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2952,6 +3271,27 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define void @store_a2f16_align4([2 x half] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2f16_align4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2f16_align4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x half] %data, ptr addrspace(7) %p, align 4
+ ret void
+}
+
define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_a2p1:
; SDAG: ; %bb.0:
@@ -3312,10 +3652,13 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v4i4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0
; SDAG-NEXT: v_mov_b32_e32 v2, 15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: buffer_store_short v0, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], s32 offset:1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_byte v1, off, s[0:3], s32
; SDAG-NEXT: buffer_load_ushort v1, off, s[0:3], s32
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_lshrrev_b16_e32 v4, 4, v1
@@ -3328,11 +3671,12 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: load_v4i4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:1
+; GISEL-NEXT: s_waitcnt vmcnt(1)
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 4, v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <4 x i4>, ptr addrspace(7) %p
@@ -3352,7 +3696,9 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-NEXT: v_lshlrev_b16_e32 v1, 12, v3
; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3363,13 +3709,12 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: v_and_b32_e32 v0, 15, v0
; GISEL-NEXT: v_lshlrev_b16_e32 v1, 4, v1
; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-NEXT: v_mov_b32_e32 v1, 15
-; GISEL-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 15, v3
-; GISEL-NEXT: v_lshlrev_b16_e32 v1, 12, v1
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 15, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 4, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3381,30 +3726,45 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: load_v8i4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v7, off, s[16:19], 0
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v0, 15, v7
-; SDAG-NEXT: v_bfe_u32 v1, v7, 4, 4
-; SDAG-NEXT: v_bfe_u32 v2, v7, 8, 4
-; SDAG-NEXT: v_bfe_u32 v3, v7, 12, 4
-; SDAG-NEXT: v_bfe_u32 v4, v7, 16, 4
-; SDAG-NEXT: v_bfe_u32 v5, v7, 20, 4
-; SDAG-NEXT: v_bfe_u32 v6, v7, 24, 4
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 28, v7
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], s32 offset:3
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: buffer_store_byte v1, off, s[0:3], s32 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: buffer_store_byte v2, off, s[0:3], s32 offset:1
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: buffer_store_byte v3, off, s[0:3], s32
+; SDAG-NEXT: buffer_load_dword v6, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v6
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 28, v6
+; SDAG-NEXT: v_bfe_u32 v1, v6, 4, 4
+; SDAG-NEXT: v_bfe_u32 v2, v6, 8, 4
+; SDAG-NEXT: v_bfe_u32 v3, v6, 12, 4
+; SDAG-NEXT: v_bfe_u32 v4, v6, 16, 4
+; SDAG-NEXT: v_bfe_u32 v5, v6, 20, 4
+; SDAG-NEXT: v_bfe_u32 v6, v6, 24, 4
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: load_v8i4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:3
+; GISEL-NEXT: s_waitcnt vmcnt(3)
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v7, 28, v0
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 4, v2
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 4, v4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 4, v6
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <8 x i4>, ptr addrspace(7) %p
@@ -3431,7 +3791,13 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
; SDAG-NEXT: v_lshlrev_b32_e32 v1, 28, v7
; SDAG-NEXT: v_and_b32_sdwa v2, v6, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; SDAG-NEXT: v_or3_b32 v0, v0, v2, v1
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:3
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3439,23 +3805,37 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 15, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GISEL-NEXT: v_and_or_b32 v0, v0, 15, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-NEXT: v_and_b32_e32 v1, 15, v2
; GISEL-NEXT: v_and_b32_e32 v2, 15, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 12, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: v_mov_b32_e32 v1, 15
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 4, v2
; GISEL-NEXT: v_and_b32_e32 v3, 15, v5
-; GISEL-NEXT: v_and_b32_sdwa v2, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v3
-; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3
-; GISEL-NEXT: v_and_b32_e32 v2, 15, v7
-; GISEL-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 28, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 15, v4
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 4, v3
+; GISEL-NEXT: v_and_b32_e32 v4, 15, v7
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v3
+; GISEL-NEXT: v_and_b32_e32 v3, 15, v6
+; GISEL-NEXT: v_lshlrev_b16_e32 v4, 4, v4
+; GISEL-NEXT: v_mov_b32_e32 v5, 8
+; GISEL-NEXT: v_or_b32_e32 v3, v3, v4
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:3
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3534,7 +3914,7 @@ define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
- %ret = load <32 x i6>, ptr addrspace(7) %p
+ %ret = load <32 x i6>, ptr addrspace(7) %p, align 4
%ret.cast = bitcast <32 x i6> %ret to <6 x i32>
ret <6 x i32> %ret.cast
}
@@ -3557,7 +3937,7 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: s_setpc_b64 s[30:31]
%data = bitcast <6 x i32> %data.abi to <32 x i6>
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
- store <32 x i6> %data, ptr addrspace(7) %p
+ store <32 x i6> %data, ptr addrspace(7) %p, align 4
ret void
}
@@ -3567,21 +3947,21 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: volatile_load_v4i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 glc
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1 glc
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2 glc
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3 glc
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: volatile_load_v4i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 glc
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1 glc
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2 glc
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3 glc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load volatile <4 x i8>, ptr addrspace(7) %p
@@ -3592,28 +3972,20 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: volatile_store_v4i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: volatile_store_v4i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v5, 8
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xff
-; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT: v_and_or_b32 v0, v0, v4, v1
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3625,30 +3997,25 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: volatile_load_v6i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
-; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
-; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 glc
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1 glc
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2 glc
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3 glc
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4 glc
+; SDAG-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5 glc
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mov_b32_e32 v1, v7
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: volatile_load_v6i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
-; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 glc
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1 glc
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2 glc
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3 glc
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4 glc
+; GISEL-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:5 glc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load volatile <6 x i8>, ptr addrspace(7) %p
@@ -3659,34 +4026,24 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: volatile_store_v6i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
-; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: volatile_store_v6i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v3
-; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
-; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index ffa9b465af0dd..5eb9f834da0d0 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=SDAG-GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG-GFX1100 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX1100 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=SDAG-GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG-GFX1100 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL-GFX1100 %s
; Note: if you're adding tests here, also add them to
; lower-buffer-fat-pointers-mem-transfer.ll to verify the IR produced by
@@ -244,90 +244,82 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspa
; SDAG-GFX942-NEXT: .p2align 8
; SDAG-GFX942-NEXT: ; %bb.4:
; SDAG-GFX942-NEXT: .LBB0_0:
-; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; SDAG-GFX942-NEXT: s_load_dword s17, s[4:5], 0x34
-; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44
-; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x54
-; SDAG-GFX942-NEXT: s_mov_b32 s16, 0
-; SDAG-GFX942-NEXT: s_mov_b32 s5, s16
+; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; SDAG-GFX942-NEXT: s_load_dword s15, s[4:5], 0x30
+; SDAG-GFX942-NEXT: s_mov_b32 s14, 0
+; SDAG-GFX942-NEXT: s_mov_b32 s5, s14
; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX942-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17]
-; SDAG-GFX942-NEXT: s_mov_b32 s17, s2
+; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[14:15]
+; SDAG-GFX942-NEXT: s_mov_b32 s15, s2
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
-; SDAG-GFX942-NEXT: s_mov_b32 s3, s16
-; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[16:17]
-; SDAG-GFX942-NEXT: s_mov_b32 s17, s12
-; SDAG-GFX942-NEXT: s_mov_b32 s2, s11
-; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[16:17]
-; SDAG-GFX942-NEXT: s_mov_b32 s17, s10
-; SDAG-GFX942-NEXT: s_mov_b32 s2, s9
-; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17]
+; SDAG-GFX942-NEXT: s_mov_b32 s3, s14
+; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[14:15]
; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
+; SDAG-GFX942-NEXT: s_add_i32 s1, s12, s14
; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
-; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[8:11], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[8:11], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[8:11], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_add_i32 s2, s0, s14
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
-; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000
+; SDAG-GFX942-NEXT: s_addk_i32 s14, 0x100
+; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s14, 0x2000
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[8:11], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[8:11], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[8:11], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[8:11], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[8:11], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[8:11], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[8:11], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[8:11], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[8:11], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[8:11], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[8:11], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[8:11], 0 offen offset:224
; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[8:11], 0 offen offset:240
; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: s_nop 1
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[4:7], 0 offen offset:32
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[4:7], 0 offen offset:48
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[4:7], 0 offen offset:64
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[4:7], 0 offen offset:80
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[4:7], 0 offen offset:96
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[4:7], 0 offen offset:112
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[4:7], 0 offen offset:128
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[4:7], 0 offen offset:144
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[4:7], 0 offen offset:160
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[4:7], 0 offen offset:176
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[4:7], 0 offen offset:192
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[4:7], 0 offen offset:208
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[4:7], 0 offen offset:224
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[4:7], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -335,10 +327,10 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspa
; SDAG-GFX1100-LABEL: memcpy_known:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_clause 0x3
-; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX1100-NEXT: s_load_b32 s17, s[4:5], 0x34
-; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
-; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54
+; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; SDAG-GFX1100-NEXT: s_load_b32 s17, s[4:5], 0x10
+; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
+; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x30
; SDAG-GFX1100-NEXT: s_mov_b32 s16, 0
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s16
@@ -422,37 +414,29 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspa
;
; GISEL-GFX942-LABEL: memcpy_known:
; GISEL-GFX942: ; %bb.0:
-; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
-; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GISEL-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GISEL-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x30
+; GISEL-GFX942-NEXT: s_mov_b32 s15, 0
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
-; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GISEL-GFX942-NEXT: s_mov_b32 s14, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s15
+; GISEL-GFX942-NEXT: s_mov_b32 s5, s2
+; GISEL-GFX942-NEXT: s_mov_b32 s13, 0
+; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[14:15], s[4:5]
+; GISEL-GFX942-NEXT: s_mov_b32 s14, s3
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
+; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[14:15], s[6:7]
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s13
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
+; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v1
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
-; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
+; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s0, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -510,10 +494,10 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspa
; GISEL-GFX1100-LABEL: memcpy_known:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_clause 0x3
-; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
-; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
+; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
+; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x10
+; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x30
; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
@@ -811,33 +795,25 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
; SDAG-GFX942-NEXT: .p2align 8
; SDAG-GFX942-NEXT: ; %bb.4:
; SDAG-GFX942-NEXT: .LBB1_0:
-; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34
-; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44
-; SDAG-GFX942-NEXT: s_load_dword s14, s[4:5], 0x54
-; SDAG-GFX942-NEXT: s_mov_b32 s12, 0
-; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
+; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; SDAG-GFX942-NEXT: s_load_dword s15, s[4:5], 0x30
+; SDAG-GFX942-NEXT: s_mov_b32 s14, 0
+; SDAG-GFX942-NEXT: s_mov_b32 s5, s14
; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX942-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
-; SDAG-GFX942-NEXT: s_mov_b32 s13, s2
+; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[14:15]
+; SDAG-GFX942-NEXT: s_mov_b32 s15, s2
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
-; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
-; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: s_mov_b32 s13, s14
-; SDAG-GFX942-NEXT: s_mov_b32 s2, s11
-; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: s_mov_b32 s13, s10
-; SDAG-GFX942-NEXT: s_mov_b32 s2, s9
-; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13]
+; SDAG-GFX942-NEXT: s_mov_b32 s3, s14
+; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[14:15]
; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
-; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
+; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s12, v0
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
+; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s0, v0
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -845,22 +821,22 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[8:11], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[8:11], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[8:11], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[8:11], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[8:11], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[8:11], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240
; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2)
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
@@ -870,24 +846,24 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -895,10 +871,10 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
; SDAG-GFX1100-LABEL: memcpy_known_medium:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_clause 0x3
-; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34
-; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
-; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54
+; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x10
+; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
+; SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x30
; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
@@ -979,36 +955,28 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
;
; GISEL-GFX942-LABEL: memcpy_known_medium:
; GISEL-GFX942: ; %bb.0:
-; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
-; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
-; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
+; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GISEL-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GISEL-GFX942-NEXT: s_load_dword s12, s[4:5], 0x10
+; GISEL-GFX942-NEXT: s_load_dword s7, s[4:5], 0x30
+; GISEL-GFX942-NEXT: s_mov_b32 s15, 0
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
-; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_load_dword s3, s[4:5], 0x54
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s13
-; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
-; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
-; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GISEL-GFX942-NEXT: s_mov_b32 s14, s1
+; GISEL-GFX942-NEXT: s_mov_b32 s4, s15
+; GISEL-GFX942-NEXT: s_mov_b32 s5, s2
+; GISEL-GFX942-NEXT: s_mov_b32 s13, 0
+; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[14:15], s[4:5]
+; GISEL-GFX942-NEXT: s_mov_b32 s14, s3
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
-; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[14:15], s[6:7]
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s13
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
+; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s12, v0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
-; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
+; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v0
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
@@ -1068,10 +1036,10 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
; GISEL-GFX1100-LABEL: memcpy_known_medium:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_clause 0x3
-; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
-; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x54
+; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
+; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x10
+; GISEL-GFX1100-NEXT: s_load_b32 s15, s[4:5], 0x30
; GISEL-GFX1100-NEXT: s_mov_b32 s17, 0
; GISEL-GFX1100-NEXT: s_mov_b32 s12, 0
; GISEL-GFX1100-NEXT: s_mov_b32 s4, s17
@@ -1200,21 +1168,11 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr a
; SDAG-GFX942-NEXT: .p2align 8
; SDAG-GFX942-NEXT: ; %bb.2:
; SDAG-GFX942-NEXT: .LBB2_0:
-; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34
-; SDAG-GFX942-NEXT: s_mov_b32 s12, 0
-; SDAG-GFX942-NEXT: s_mov_b32 s7, s12
-; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX942-NEXT: s_mov_b32 s6, s3
-; SDAG-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
-; SDAG-GFX942-NEXT: s_mov_b32 s13, s2
-; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
-; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
-; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s12
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
-; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
-; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
+; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x30
+; SDAG-GFX942-NEXT: s_mov_b32 s12, 0
; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX942-NEXT: s_mov_b32 s4, s3
@@ -1234,8 +1192,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr a
; SDAG-GFX1100-LABEL: memcpy_known_small:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_clause 0x1
-; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34
+; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x10
; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0
; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-GFX1100-NEXT: s_mov_b32 s7, s12
@@ -1250,8 +1208,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr a
; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen
; SDAG-GFX1100-NEXT: s_clause 0x1
-; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54
-; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x30
+; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12
; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0
@@ -1271,33 +1229,25 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr a
;
; GISEL-GFX942-LABEL: memcpy_known_small:
; GISEL-GFX942: ; %bb.0:
-; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
+; GISEL-GFX942-NEXT: s_load_dword s6, s[4:5], 0x10
+; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GISEL-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
+; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x30
; GISEL-GFX942-NEXT: s_mov_b32 s7, 0
-; GISEL-GFX942-NEXT: s_mov_b32 s8, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s10, s7
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
-; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
-; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
-; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
-; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GISEL-GFX942-NEXT: s_mov_b32 s6, s9
; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
-; GISEL-GFX942-NEXT: s_mov_b32 s12, s7
-; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s1
-; GISEL-GFX942-NEXT: s_mov_b32 s5, s2
+; GISEL-GFX942-NEXT: s_mov_b32 s5, s10
; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX942-NEXT: s_mov_b32 s6, s11
+; GISEL-GFX942-NEXT: s_mov_b32 s12, s7
; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s8
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen offset:16
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: s_endpgm
@@ -1305,8 +1255,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr a
; GISEL-GFX1100-LABEL: memcpy_known_small:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_clause 0x1
-; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x34
+; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x10
; GISEL-GFX1100-NEXT: s_mov_b32 s13, 0
; GISEL-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-GFX1100-NEXT: s_mov_b32 s8, s13
@@ -1321,8 +1271,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr a
; GISEL-GFX1100-NEXT: s_or_b64 s[2:3], s[12:13], s[6:7]
; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[0:3], 0 offen
; GISEL-GFX1100-NEXT: s_clause 0x1
-; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44
-; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54
+; GISEL-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
+; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x30
; GISEL-GFX1100-NEXT: s_mov_b32 s4, s13
; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 3765bb0af79ba..4a2edf7d14751 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -93,7 +93,7 @@ define void @caller(ptr addrspace(7) noundef nonnull %arg) {
; CHECK-NEXT: [[V_INT_LEGAL:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
; CHECK-NEXT: [[V_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
-; CHECK-NEXT: [[ARG_PART_4:%.*]] = add nuw i32 [[ARG_OFF]], 16
+; CHECK-NEXT: [[ARG_PART_4:%.*]] = add i32 [[ARG_OFF]], 16
; CHECK-NEXT: [[V_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_LEGAL]], i64 4
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
; CHECK-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization-alignment.ll
new file mode 100644
index 0000000000000..de29f6a305eaf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization-alignment.ll
@@ -0,0 +1,3163 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s --check-prefix=STRICT
+;; Note: unaligned-access-mode is default on HSA targets
+; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers -mattr=+unaligned-access-mode < %s | FileCheck %s --check-prefix=UNALIGNED_ONLY
+; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers -mattr=+relaxed-buffer-oob-mode < %s | FileCheck %s --check-prefix=RELAXED_OOB_ONLY
+; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers -mattr=+unaligned-access-mode,+relaxed-buffer-oob-mode < %s | FileCheck %s --check-prefix=BOTH_FLAGS
+
+target triple = "amdgcn--"
+
+define i32 @load_i32_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define i32 @load_i32_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
+; STRICT-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: ret i32 [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define i32 @load_i32_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret i32 [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define i32 @load_i32_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret i32 [[RET]]
+;
+; BOTH_FLAGS-LABEL: define i32 @load_i32_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret i32 [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load i32, ptr addrspace(7) %q, align 4
+ ret i32 %ret
+}
+
+define void @store_i32_align4(i32 %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_i32_align4(
+; STRICT-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_i32_align4(
+; UNALIGNED_ONLY-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_i32_align4(
+; RELAXED_OOB_ONLY-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_i32_align4(
+; BOTH_FLAGS-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store i32 %data, ptr addrspace(7) %q, align 4
+ ret void
+}
+
+define i32 @load_i32_align2(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define i32 @load_i32_align2(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; STRICT-NEXT: [[RET:%.*]] = bitcast <2 x i16> [[RET_SLICE_1]] to i32
+; STRICT-NEXT: ret i32 [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define i32 @load_i32_align2(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret i32 [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define i32 @load_i32_align2(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <2 x i16> [[RET_SLICE_1]] to i32
+; RELAXED_OOB_ONLY-NEXT: ret i32 [[RET]]
+;
+; BOTH_FLAGS-LABEL: define i32 @load_i32_align2(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret i32 [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load i32, ptr addrspace(7) %q, align 2
+ ret i32 %ret
+}
+
+define void @store_i32_align2(i32 %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_i32_align2(
+; STRICT-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast i32 [[DATA]] to <2 x i16>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_i32_align2(
+; UNALIGNED_ONLY-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_i32_align2(
+; RELAXED_OOB_ONLY-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast i32 [[DATA]] to <2 x i16>
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_i32_align2(
+; BOTH_FLAGS-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store i32 %data, ptr addrspace(7) %q, align 2
+ ret void
+}
+
+define i32 @load_i32_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define i32 @load_i32_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: [[RET:%.*]] = bitcast <4 x i8> [[RET_SLICE_3]] to i32
+; STRICT-NEXT: ret i32 [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define i32 @load_i32_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret i32 [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define i32 @load_i32_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <4 x i8> [[RET_SLICE_3]] to i32
+; RELAXED_OOB_ONLY-NEXT: ret i32 [[RET]]
+;
+; BOTH_FLAGS-LABEL: define i32 @load_i32_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret i32 [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load i32, ptr addrspace(7) %q, align 1
+ ret i32 %ret
+}
+
+define void @store_i32_align1(i32 %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_i32_align1(
+; STRICT-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast i32 [[DATA]] to <4 x i8>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_i32_align1(
+; UNALIGNED_ONLY-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_i32_align1(
+; RELAXED_OOB_ONLY-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast i32 [[DATA]] to <4 x i8>
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 2
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 3
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_i32_align1(
+; BOTH_FLAGS-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store i32 %data, ptr addrspace(7) %q, align 1
+ ret void
+}
+
+define i64 @load_i64_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define i64 @load_i64_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: ret i64 [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define i64 @load_i64_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret i64 [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define i64 @load_i64_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret i64 [[RET]]
+;
+; BOTH_FLAGS-LABEL: define i64 @load_i64_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret i64 [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load i64, ptr addrspace(7) %q, align 4
+ ret i64 %ret
+}
+
+define void @store_i64_align4(i64 %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_i64_align4(
+; STRICT-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_i64_align4(
+; UNALIGNED_ONLY-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_i64_align4(
+; RELAXED_OOB_ONLY-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_i64_align4(
+; BOTH_FLAGS-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store i64 %data, ptr addrspace(7) %q, align 4
+ ret void
+}
+
+define i64 @load_i64_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define i64 @load_i64_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; STRICT-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; STRICT-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; STRICT-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; STRICT-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to i64
+; STRICT-NEXT: ret i64 [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define i64 @load_i64_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret i64 [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define i64 @load_i64_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_5:%.*]] = add nuw i32 [[Q]], 5
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_7:%.*]] = add nuw i32 [[Q]], 7
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to i64
+; RELAXED_OOB_ONLY-NEXT: ret i64 [[RET]]
+;
+; BOTH_FLAGS-LABEL: define i64 @load_i64_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret i64 [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load i64, ptr addrspace(7) %q, align 1
+ ret i64 %ret
+}
+
+define void @store_i64_align1(i64 %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_i64_align1(
+; STRICT-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast i64 [[DATA]] to <8 x i8>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_i64_align1(
+; UNALIGNED_ONLY-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_i64_align1(
+; RELAXED_OOB_ONLY-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast i64 [[DATA]] to <8 x i8>
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_5:%.*]] = add nuw i32 [[Q]], 5
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_7:%.*]] = add nuw i32 [[Q]], 7
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_i64_align1(
+; BOTH_FLAGS-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store i64 %data, ptr addrspace(7) %q, align 1
+ ret void
+}
+
+define <2 x i32> @load_v2i32_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i32> @load_v2i32_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET:%.*]] = insertelement <2 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; STRICT-NEXT: ret <2 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i32> @load_v2i32_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = insertelement <2 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; UNALIGNED_ONLY-NEXT: ret <2 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i32> @load_v2i32_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret <2 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i32> @load_v2i32_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret <2 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <2 x i32>, ptr addrspace(7) %q, align 4
+ ret <2 x i32> %ret
+}
+
+define void @store_v2i32_align4(<2 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i32_align4(
+; STRICT-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i32> [[DATA]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i32> [[DATA]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i32_align4(
+; UNALIGNED_ONLY-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i32> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i32> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i32_align4(
+; RELAXED_OOB_ONLY-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i32_align4(
+; BOTH_FLAGS-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store <2 x i32> %data, ptr addrspace(7) %q, align 4
+ ret void
+}
+
+define <2 x i32> @load_v2i32_align2(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i32> @load_v2i32_align2(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <4 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
+; STRICT-NEXT: [[RET:%.*]] = bitcast <4 x i16> [[RET_SLICE_3]] to <2 x i32>
+; STRICT-NEXT: ret <2 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i32> @load_v2i32_align2(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = insertelement <2 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; UNALIGNED_ONLY-NEXT: ret <2 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i32> @load_v2i32_align2(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <4 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <4 x i16> [[RET_SLICE_3]] to <2 x i32>
+; RELAXED_OOB_ONLY-NEXT: ret <2 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i32> @load_v2i32_align2(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret <2 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <2 x i32>, ptr addrspace(7) %q, align 2
+ ret <2 x i32> %ret
+}
+
+define void @store_v2i32_align2(<2 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i32_align2(
+; STRICT-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i32> [[DATA]] to <4 x i16>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i32_align2(
+; UNALIGNED_ONLY-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i32> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i32> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i32_align2(
+; RELAXED_OOB_ONLY-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i32> [[DATA]] to <4 x i16>
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 2
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 3
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i32_align2(
+; BOTH_FLAGS-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store <2 x i32> %data, ptr addrspace(7) %q, align 2
+ ret void
+}
+
+define <2 x i32> @load_v2i32_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i32> @load_v2i32_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; STRICT-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; STRICT-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; STRICT-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; STRICT-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <2 x i32>
+; STRICT-NEXT: ret <2 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i32> @load_v2i32_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = insertelement <2 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; UNALIGNED_ONLY-NEXT: ret <2 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i32> @load_v2i32_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_5:%.*]] = add nuw i32 [[Q]], 5
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_7:%.*]] = add nuw i32 [[Q]], 7
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <2 x i32>
+; RELAXED_OOB_ONLY-NEXT: ret <2 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i32> @load_v2i32_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret <2 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <2 x i32>, ptr addrspace(7) %q, align 1
+ ret <2 x i32> %ret
+}
+
+define void @store_v2i32_align1(<2 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i32_align1(
+; STRICT-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i32> [[DATA]] to <8 x i8>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i32_align1(
+; UNALIGNED_ONLY-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i32> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i32> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i32_align1(
+; RELAXED_OOB_ONLY-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i32> [[DATA]] to <8 x i8>
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_5:%.*]] = add nuw i32 [[Q]], 5
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_7:%.*]] = add nuw i32 [[Q]], 7
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i32_align1(
+; BOTH_FLAGS-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store <2 x i32> %data, ptr addrspace(7) %q, align 1
+ ret void
+}
+
+define <3 x i32> @load_v3i32_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <3 x i32> @load_v3i32_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <3 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <3 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[RET_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; STRICT-NEXT: [[RET:%.*]] = insertelement <3 x i32> [[RET_SLICE_1]], i32 [[RET_OFF_8]], i64 2
+; STRICT-NEXT: ret <3 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <3 x i32> @load_v3i32_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <3 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <3 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = insertelement <3 x i32> [[RET_SLICE_1]], i32 [[RET_OFF_8]], i64 2
+; UNALIGNED_ONLY-NEXT: ret <3 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <3 x i32> @load_v3i32_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret <3 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <3 x i32> @load_v3i32_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret <3 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <3 x i32>, ptr addrspace(7) %q, align 4
+ ret <3 x i32> %ret
+}
+
+define void @store_v3i32_align4(<3 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v3i32_align4(
+; STRICT-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <3 x i32> [[DATA]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <3 x i32> [[DATA]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i32> [[DATA]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v3i32_align4(
+; UNALIGNED_ONLY-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <3 x i32> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <3 x i32> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i32> [[DATA]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v3i32_align4(
+; RELAXED_OOB_ONLY-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v3i32_align4(
+; BOTH_FLAGS-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store <3 x i32> %data, ptr addrspace(7) %q, align 4
+ ret void
+}
+
+define <3 x i32> @load_v3i32_align8(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <3 x i32> @load_v3i32_align8(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i32> poison, <3 x i32> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; STRICT-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[RET_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; STRICT-NEXT: [[RET:%.*]] = insertelement <3 x i32> [[RET_PARTS_0]], i32 [[RET_OFF_8]], i64 2
+; STRICT-NEXT: ret <3 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <3 x i32> @load_v3i32_align8(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i32> poison, <3 x i32> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = insertelement <3 x i32> [[RET_PARTS_0]], i32 [[RET_OFF_8]], i64 2
+; UNALIGNED_ONLY-NEXT: ret <3 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <3 x i32> @load_v3i32_align8(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret <3 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <3 x i32> @load_v3i32_align8(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret <3 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <3 x i32>, ptr addrspace(7) %q, align 8
+ ret <3 x i32> %ret
+}
+
+define void @store_v3i32_align8(<3 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v3i32_align8(
+; STRICT-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i32> [[DATA]], <3 x i32> poison, <2 x i32> <i32 0, i32 1>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i32> [[DATA]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v3i32_align8(
+; UNALIGNED_ONLY-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i32> [[DATA]], <3 x i32> poison, <2 x i32> <i32 0, i32 1>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i32> [[DATA]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v3i32_align8(
+; RELAXED_OOB_ONLY-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v3i32_align8(
+; BOTH_FLAGS-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store <3 x i32> %data, ptr addrspace(7) %q, align 8
+ ret void
+}
+
+define <3 x i32> @load_v3i32_align16(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <3 x i32> @load_v3i32_align16(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: ret <3 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <3 x i32> @load_v3i32_align16(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret <3 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <3 x i32> @load_v3i32_align16(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret <3 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <3 x i32> @load_v3i32_align16(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret <3 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <3 x i32>, ptr addrspace(7) %q, align 16
+ ret <3 x i32> %ret
+}
+
+; Store of <3 x i32> at align 16: the 12-byte access is naturally aligned, so
+; it lowers to a single buffer.store.v3i32 and all four alignment-mode check
+; prefixes (STRICT, UNALIGNED_ONLY, RELAXED_OOB_ONLY, BOTH_FLAGS) expect
+; identical IR.
+define void @store_v3i32_align16(<3 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v3i32_align16(
+; STRICT-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v3i32_align16(
+; UNALIGNED_ONLY-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v3i32_align16(
+; RELAXED_OOB_ONLY-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v3i32_align16(
+; BOTH_FLAGS-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <3 x i32> %data, ptr addrspace(7) %q, align 16
+  ret void
+}
+
+; Load of <4 x i32> at only align 8: STRICT and UNALIGNED_ONLY split it into
+; two naturally-aligned v2i32 loads recombined by shufflevectors, while the
+; relaxed-OOB modes (RELAXED_OOB_ONLY, BOTH_FLAGS) keep the single v4i32 load
+; despite the under-alignment.
+define <4 x i32> @load_v4i32_align8(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <4 x i32> @load_v4i32_align8(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <4 x i32> poison, <4 x i32> [[RET_EXT_0]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; STRICT-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[RET_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x i32> [[RET_OFF_8]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET:%.*]] = shufflevector <4 x i32> [[RET_PARTS_0]], <4 x i32> [[RET_EXT_2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; STRICT-NEXT: ret <4 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <4 x i32> @load_v4i32_align8(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <4 x i32> poison, <4 x i32> [[RET_EXT_0]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x i32> [[RET_OFF_8]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = shufflevector <4 x i32> [[RET_PARTS_0]], <4 x i32> [[RET_EXT_2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; UNALIGNED_ONLY-NEXT: ret <4 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <4 x i32> @load_v4i32_align8(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret <4 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <4 x i32> @load_v4i32_align8(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret <4 x i32> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <4 x i32>, ptr addrspace(7) %q, align 8
+  ret <4 x i32> %ret
+}
+
+; Store counterpart of load_v4i32_align8: STRICT and UNALIGNED_ONLY slice the
+; <4 x i32> into two v2i32 stores at offsets 0 and 8; the relaxed-OOB modes
+; emit one under-aligned v4i32 store.
+define void @store_v4i32_align8(<4 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v4i32_align8(
+; STRICT-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <4 x i32> [[DATA]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = shufflevector <4 x i32> [[DATA]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v4i32_align8(
+; UNALIGNED_ONLY-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <4 x i32> [[DATA]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = shufflevector <4 x i32> [[DATA]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v4i32_align8(
+; RELAXED_OOB_ONLY-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v4i32_align8(
+; BOTH_FLAGS-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <4 x i32> %data, ptr addrspace(7) %q, align 8
+  ret void
+}
+
+; Load of <8 x i32> at align 16: every mode splits the 32-byte access into two
+; v4i32 loads at offsets 0 and 16. The only difference is that the relaxed-OOB
+; variants (RELAXED_OOB_ONLY, BOTH_FLAGS) mark the offset increment `add nuw`.
+define <8 x i32> @load_v8i32_align16(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <8 x i32> @load_v8i32_align16(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; STRICT-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; STRICT-NEXT: ret <8 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <8 x i32> @load_v8i32_align16(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; UNALIGNED_ONLY-NEXT: ret <8 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <8 x i32> @load_v8i32_align16(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; RELAXED_OOB_ONLY-NEXT: ret <8 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <8 x i32> @load_v8i32_align16(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; BOTH_FLAGS-NEXT: ret <8 x i32> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <8 x i32>, ptr addrspace(7) %q, align 16
+  ret <8 x i32> %ret
+}
+
+; Store of <8 x i32> at align 16: all modes split into two v4i32 stores at
+; offsets 0 and 16; relaxed-OOB variants additionally emit the offset add with
+; the nuw flag.
+define void @store_v8i32_align16(<8 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v8i32_align16(
+; STRICT-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v8i32_align16(
+; UNALIGNED_ONLY-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v8i32_align16(
+; RELAXED_OOB_ONLY-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v8i32_align16(
+; BOTH_FLAGS-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <8 x i32> %data, ptr addrspace(7) %q, align 16
+  ret void
+}
+
+; Load of <8 x i32> at only align 8: STRICT and UNALIGNED_ONLY decompose into
+; four naturally-aligned v2i32 loads (offsets 0/8/16/24), while the relaxed-OOB
+; modes use two under-aligned v4i32 loads with nuw offset adds.
+define <8 x i32> @load_v8i32_align8(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <8 x i32> @load_v8i32_align8(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; STRICT-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[RET_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x i32> [[RET_OFF_8]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_2:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_2]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; STRICT-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_2]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; STRICT-NEXT: [[Q_OFF_PTR_24:%.*]] = add i32 [[Q]], 24
+; STRICT-NEXT: [[RET_OFF_24:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_24]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_6:%.*]] = shufflevector <2 x i32> [[RET_OFF_24]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_4]], <8 x i32> [[RET_EXT_6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; STRICT-NEXT: ret <8 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <8 x i32> @load_v8i32_align8(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x i32> [[RET_OFF_8]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_2:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_2]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_2]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_24:%.*]] = add i32 [[Q]], 24
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_24:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_24]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_6:%.*]] = shufflevector <2 x i32> [[RET_OFF_24]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_4]], <8 x i32> [[RET_EXT_6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: ret <8 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <8 x i32> @load_v8i32_align8(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; RELAXED_OOB_ONLY-NEXT: ret <8 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <8 x i32> @load_v8i32_align8(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; BOTH_FLAGS-NEXT: ret <8 x i32> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <8 x i32>, ptr addrspace(7) %q, align 8
+  ret <8 x i32> %ret
+}
+
+; Store counterpart of load_v8i32_align8: STRICT and UNALIGNED_ONLY emit four
+; v2i32 stores (offsets 0/8/16/24); the relaxed-OOB modes emit two v4i32
+; stores with a nuw offset add.
+define void @store_v8i32_align8(<8 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v8i32_align8(
+; STRICT-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 24
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_6]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v8i32_align8(
+; UNALIGNED_ONLY-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 24
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_6]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v8i32_align8(
+; RELAXED_OOB_ONLY-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v8i32_align8(
+; BOTH_FLAGS-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <8 x i32> %data, ptr addrspace(7) %q, align 8
+  ret void
+}
+
+define <8 x i32> @load_v8i32_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <8 x i32> @load_v8i32_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[RET_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i32> [[RET_SLICE_1]], i32 [[RET_OFF_8]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_12:%.*]] = add i32 [[Q]], 12
+; STRICT-NEXT: [[RET_OFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_12]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i32> [[RET_SLICE_2]], i32 [[RET_OFF_12]], i64 3
+; STRICT-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[RET_OFF_16:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i32> [[RET_SLICE_3]], i32 [[RET_OFF_16]], i64 4
+; STRICT-NEXT: [[Q_OFF_PTR_20:%.*]] = add i32 [[Q]], 20
+; STRICT-NEXT: [[RET_OFF_20:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_20]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i32> [[RET_SLICE_4]], i32 [[RET_OFF_20]], i64 5
+; STRICT-NEXT: [[Q_OFF_PTR_24:%.*]] = add i32 [[Q]], 24
+; STRICT-NEXT: [[RET_OFF_24:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_24]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i32> [[RET_SLICE_5]], i32 [[RET_OFF_24]], i64 6
+; STRICT-NEXT: [[Q_OFF_PTR_28:%.*]] = add i32 [[Q]], 28
+; STRICT-NEXT: [[RET_OFF_28:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_28]], i32 0, i32 0)
+; STRICT-NEXT: [[RET:%.*]] = insertelement <8 x i32> [[RET_SLICE_6]], i32 [[RET_OFF_28]], i64 7
+; STRICT-NEXT: ret <8 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <8 x i32> @load_v8i32_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i32> [[RET_SLICE_1]], i32 [[RET_OFF_8]], i64 2
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_12:%.*]] = add i32 [[Q]], 12
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_12]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i32> [[RET_SLICE_2]], i32 [[RET_OFF_12]], i64 3
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_16:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i32> [[RET_SLICE_3]], i32 [[RET_OFF_16]], i64 4
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_20:%.*]] = add i32 [[Q]], 20
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_20:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_20]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i32> [[RET_SLICE_4]], i32 [[RET_OFF_20]], i64 5
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_24:%.*]] = add i32 [[Q]], 24
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_24:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_24]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i32> [[RET_SLICE_5]], i32 [[RET_OFF_24]], i64 6
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_28:%.*]] = add i32 [[Q]], 28
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_28:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_28]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = insertelement <8 x i32> [[RET_SLICE_6]], i32 [[RET_OFF_28]], i64 7
+; UNALIGNED_ONLY-NEXT: ret <8 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <8 x i32> @load_v8i32_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; RELAXED_OOB_ONLY-NEXT: ret <8 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <8 x i32> @load_v8i32_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; BOTH_FLAGS-NEXT: ret <8 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <8 x i32>, ptr addrspace(7) %q, align 4
+ ret <8 x i32> %ret
+}
+
+; Store of <8 x i32> with only align-4 known: STRICT and UNALIGNED_ONLY scalarize
+; into eight i32 buffer stores at offsets +0..+28, while RELAXED_OOB_ONLY and
+; BOTH_FLAGS vectorize into two <4 x i32> buffer stores whose offset adds carry
+; `nuw` (CHECK lines are autogenerated; do not hand-edit them).
+define void @store_v8i32_align4(<8 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v8i32_align4(
+; STRICT-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i32> [[DATA]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i32> [[DATA]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i32> [[DATA]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 12
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i32> [[DATA]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_3]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i32> [[DATA]], i64 4
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 20
+; STRICT-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i32> [[DATA]], i64 5
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_5]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 24
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i32> [[DATA]], i64 6
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 28
+; STRICT-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i32> [[DATA]], i64 7
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_7]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v8i32_align4(
+; UNALIGNED_ONLY-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i32> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i32> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_1]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i32> [[DATA]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 12
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i32> [[DATA]], i64 3
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_3]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i32> [[DATA]], i64 4
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 20
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i32> [[DATA]], i64 5
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_5]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 24
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i32> [[DATA]], i64 6
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 28
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i32> [[DATA]], i64 7
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_7]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v8i32_align4(
+; RELAXED_OOB_ONLY-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v8i32_align4(
+; BOTH_FLAGS-SAME: <8 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <8 x i32> %data, ptr addrspace(7) %q, align 4
+  ret void
+}
+
+; Load of <10 x i32> at align 16: every mode splits it into two <4 x i32> loads
+; plus one trailing <2 x i32> load; the RELAXED_OOB_ONLY and BOTH_FLAGS runs
+; differ only in marking the offset adds `nuw` (CHECK lines are autogenerated).
+define <10 x i32> @load_v10i32_align16(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <10 x i32> @load_v10i32_align16(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; STRICT-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; STRICT-NEXT: [[Q_OFF_PTR_32:%.*]] = add i32 [[Q]], 32
+; STRICT-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; STRICT-NEXT: ret <10 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <10 x i32> @load_v10i32_align16(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_32:%.*]] = add i32 [[Q]], 32
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; UNALIGNED_ONLY-NEXT: ret <10 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <10 x i32> @load_v10i32_align16(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_32:%.*]] = add nuw i32 [[Q]], 32
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; RELAXED_OOB_ONLY-NEXT: ret <10 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <10 x i32> @load_v10i32_align16(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; BOTH_FLAGS-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; BOTH_FLAGS-NEXT: [[Q_OFF_PTR_32:%.*]] = add nuw i32 [[Q]], 32
+; BOTH_FLAGS-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; BOTH_FLAGS-NEXT: ret <10 x i32> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <10 x i32>, ptr addrspace(7) %q, align 16
+  ret <10 x i32> %ret
+}
+
+; Store of <10 x i32> at align 16: every mode splits it into two <4 x i32>
+; stores plus one trailing <2 x i32> store; RELAXED_OOB_ONLY and BOTH_FLAGS
+; differ only in marking the offset adds `nuw` (CHECK lines are autogenerated).
+define void @store_v10i32_align16(<10 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v10i32_align16(
+; STRICT-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_8:%.*]] = add i32 [[Q]], 32
+; STRICT-NEXT: [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v10i32_align16(
+; UNALIGNED_ONLY-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_8:%.*]] = add i32 [[Q]], 32
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v10i32_align16(
+; RELAXED_OOB_ONLY-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_8:%.*]] = add nuw i32 [[Q]], 32
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v10i32_align16(
+; BOTH_FLAGS-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[Q_PART_8:%.*]] = add nuw i32 [[Q]], 32
+; BOTH_FLAGS-NEXT: [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <10 x i32> %data, ptr addrspace(7) %q, align 16
+  ret void
+}
+
+define <10 x i32> @load_v10i32_align8(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <10 x i32> @load_v10i32_align8(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; STRICT-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT: [[RET_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x i32> [[RET_OFF_8]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_2:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_2]], <10 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; STRICT-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_2]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 8, i32 9>
+; STRICT-NEXT: [[Q_OFF_PTR_24:%.*]] = add i32 [[Q]], 24
+; STRICT-NEXT: [[RET_OFF_24:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_24]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_6:%.*]] = shufflevector <2 x i32> [[RET_OFF_24]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET_PARTS_6:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_6]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+; STRICT-NEXT: [[Q_OFF_PTR_32:%.*]] = add i32 [[Q]], 32
+; STRICT-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; STRICT-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_6]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; STRICT-NEXT: ret <10 x i32> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <10 x i32> @load_v10i32_align8(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i32> [[RET_OFF_0]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_8:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x i32> [[RET_OFF_8]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_2:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_2]], <10 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_2]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_24:%.*]] = add i32 [[Q]], 24
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_24:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_24]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_6:%.*]] = shufflevector <2 x i32> [[RET_OFF_24]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET_PARTS_6:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_6]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_32:%.*]] = add i32 [[Q]], 32
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_6]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; UNALIGNED_ONLY-NEXT: ret <10 x i32> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <10 x i32> @load_v10i32_align8(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_32:%.*]] = add nuw i32 [[Q]], 32
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; RELAXED_OOB_ONLY-NEXT: ret <10 x i32> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <10 x i32> @load_v10i32_align8(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x i32> poison, <10 x i32> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; BOTH_FLAGS-NEXT: [[Q_OFF_PTR_16:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_16]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x i32> [[RET_PARTS_0]], <10 x i32> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; BOTH_FLAGS-NEXT: [[Q_OFF_PTR_32:%.*]] = add nuw i32 [[Q]], 32
+; BOTH_FLAGS-NEXT: [[RET_OFF_32:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q_OFF_PTR_32]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x i32> [[RET_OFF_32]], <2 x i32> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = shufflevector <10 x i32> [[RET_PARTS_4]], <10 x i32> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; BOTH_FLAGS-NEXT: ret <10 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <10 x i32>, ptr addrspace(7) %q, align 8
+ ret <10 x i32> %ret
+}
+
+; Test: store of <10 x i32> through a buffer fat pointer with only 8-byte
+; alignment known. STRICT and UNALIGNED_ONLY split the store into five
+; v2i32 (8-byte) pieces; the relaxed-OOB modes instead emit two v4i32
+; stores plus a v2i32 tail and use `add nuw` for the offset arithmetic --
+; presumably the wider slices are legal only when relaxed-buffer-oob-mode
+; is on (see the PR description; confirm against the pass logic).
+define void @store_v10i32_align8(<10 x i32> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v10i32_align8(
+; STRICT-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 0, i32 1>
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; STRICT-NEXT:    [[DATA_SLICE_2:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 2, i32 3>
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; STRICT-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 4, i32 5>
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_6:%.*]] = add i32 [[Q]], 24
+; STRICT-NEXT:    [[DATA_SLICE_6:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 6, i32 7>
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_6]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_8:%.*]] = add i32 [[Q]], 32
+; STRICT-NEXT:    [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v10i32_align8(
+; UNALIGNED_ONLY-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 0, i32 1>
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_2:%.*]] = add i32 [[Q]], 8
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_2:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 2, i32 3>
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_2]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_4:%.*]] = add i32 [[Q]], 16
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 4, i32 5>
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_6:%.*]] = add i32 [[Q]], 24
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_6:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 6, i32 7>
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_6]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_8:%.*]] = add i32 [[Q]], 32
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v10i32_align8(
+; RELAXED_OOB_ONLY-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_8:%.*]] = add nuw i32 [[Q]], 32
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v10i32_align8(
+; BOTH_FLAGS-SAME: <10 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 16
+; BOTH_FLAGS-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    [[Q_PART_8:%.*]] = add nuw i32 [[Q]], 32
+; BOTH_FLAGS-NEXT:    [[DATA_SLICE_8:%.*]] = shufflevector <10 x i32> [[DATA]], <10 x i32> poison, <2 x i32> <i32 8, i32 9>
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_8]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q_PART_8]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <10 x i32> %data, ptr addrspace(7) %q, align 8
+  ret void
+}
+
+; Test: i16 load at align 1. Without unaligned-access support (STRICT,
+; RELAXED_OOB_ONLY) the load is split into two i8 buffer loads reassembled
+; through a <2 x i8> bitcast; with unaligned access enabled (UNALIGNED_ONLY,
+; BOTH_FLAGS) a single i16 buffer load with align 1 is emitted.
+define i16 @load_i16_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define i16 @load_i16_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT:    [[RET:%.*]] = bitcast <2 x i8> [[RET_SLICE_1]] to i16
+; STRICT-NEXT:    ret i16 [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define i16 @load_i16_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret i16 [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define i16 @load_i16_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = bitcast <2 x i8> [[RET_SLICE_1]] to i16
+; RELAXED_OOB_ONLY-NEXT:    ret i16 [[RET]]
+;
+; BOTH_FLAGS-LABEL: define i16 @load_i16_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret i16 [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load i16, ptr addrspace(7) %q, align 1
+  ret i16 %ret
+}
+
+; Test: i16 store at align 1 (store-side counterpart of load_i16_align1).
+; Without unaligned access (STRICT, RELAXED_OOB_ONLY) the value is bitcast
+; to <2 x i8> and stored as two byte stores; with it (UNALIGNED_ONLY,
+; BOTH_FLAGS) a single align-1 i16 buffer store suffices.
+define void @store_i16_align1(i16 %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_i16_align1(
+; STRICT-SAME: i16 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i16 [[DATA]] to <2 x i8>
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_i16_align1(
+; UNALIGNED_ONLY-SAME: i16 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_i16_align1(
+; RELAXED_OOB_ONLY-SAME: i16 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i16 [[DATA]] to <2 x i8>
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_i16_align1(
+; BOTH_FLAGS-SAME: i16 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store i16 %data, ptr addrspace(7) %q, align 1
+  ret void
+}
+
+; Test: <2 x i16> load at align 4 (natural alignment). The vectorized
+; single v2i16 load is kept only in the relaxed-OOB modes (RELAXED_OOB_ONLY,
+; BOTH_FLAGS); STRICT and UNALIGNED_ONLY scalarize into two i16 loads --
+; consistent with the PR note that relaxed-buffer-oob-mode gates
+; vectorization here (alignment alone is not the deciding factor).
+define <2 x i16> @load_v2i16_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i16> @load_v2i16_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; STRICT-NEXT:    ret <2 x i16> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i16> @load_v2i16_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; UNALIGNED_ONLY-NEXT:    ret <2 x i16> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i16> @load_v2i16_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret <2 x i16> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i16> @load_v2i16_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret <2 x i16> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <2 x i16>, ptr addrspace(7) %q, align 4
+  ret <2 x i16> %ret
+}
+
+; Test: <2 x i16> store at align 4 (store-side counterpart of
+; load_v2i16_align4). Relaxed-OOB modes keep the single v2i16 store;
+; STRICT and UNALIGNED_ONLY scalarize into two i16 stores.
+define void @store_v2i16_align4(<2 x i16> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i16_align4(
+; STRICT-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i16_align4(
+; UNALIGNED_ONLY-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i16_align4(
+; RELAXED_OOB_ONLY-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i16_align4(
+; BOTH_FLAGS-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <2 x i16> %data, ptr addrspace(7) %q, align 4
+  ret void
+}
+
+; Test: <2 x i16> load at align 2 (underaligned for the vector type). Only
+; BOTH_FLAGS -- both unaligned access and relaxed OOB enabled -- keeps the
+; single v2i16 load; every other mode splits into two naturally-aligned
+; i16 loads.
+define <2 x i16> @load_v2i16_align2(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i16> @load_v2i16_align2(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; STRICT-NEXT:    ret <2 x i16> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i16> @load_v2i16_align2(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; UNALIGNED_ONLY-NEXT:    ret <2 x i16> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i16> @load_v2i16_align2(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    ret <2 x i16> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i16> @load_v2i16_align2(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret <2 x i16> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <2 x i16>, ptr addrspace(7) %q, align 2
+  ret <2 x i16> %ret
+}
+
+; Test: <2 x i16> store at align 2 (store-side counterpart of
+; load_v2i16_align2). Only BOTH_FLAGS keeps the vector store; the other
+; modes emit two scalar i16 stores.
+define void @store_v2i16_align2(<2 x i16> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i16_align2(
+; STRICT-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i16_align2(
+; UNALIGNED_ONLY-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i16_align2(
+; RELAXED_OOB_ONLY-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i16_align2(
+; BOTH_FLAGS-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <2 x i16> %data, ptr addrspace(7) %q, align 2
+  ret void
+}
+
+; Test: <2 x i16> load at align 1. Four distinct lowerings, one per mode:
+; STRICT and RELAXED_OOB_ONLY do four byte loads reassembled through a
+; <4 x i8> bitcast (no unaligned access available); UNALIGNED_ONLY does two
+; align-1 i16 loads (unaligned access but no relaxed OOB, so no
+; vectorization); BOTH_FLAGS emits a single align-1 v2i16 load.
+define <2 x i16> @load_v2i16_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i16> @load_v2i16_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT:    [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT:    [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_3:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT:    [[RET:%.*]] = bitcast <4 x i8> [[RET_SLICE_3]] to <2 x i16>
+; STRICT-NEXT:    ret <2 x i16> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i16> @load_v2i16_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; UNALIGNED_ONLY-NEXT:    ret <2 x i16> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i16> @load_v2i16_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; RELAXED_OOB_ONLY-NEXT:    [[Q_OFF_PTR_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_3:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = bitcast <4 x i8> [[RET_SLICE_3]] to <2 x i16>
+; RELAXED_OOB_ONLY-NEXT:    ret <2 x i16> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i16> @load_v2i16_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret <2 x i16> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <2 x i16>, ptr addrspace(7) %q, align 1
+  ret <2 x i16> %ret
+}
+
+; Under-aligned <2 x i16> store (align 1) through a buffer fat pointer.
+; Per the generated checks: with both alignment-relaxation features enabled
+; the vector store is kept whole; with only unaligned access allowed it is
+; split into two naturally sized i16 stores; the strict mode and the
+; relaxed-OOB-only mode fall back to four single-byte stores.
+define void @store_v2i16_align1(<2 x i16> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i16_align1(
+; STRICT-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i16> [[DATA]] to <4 x i8>
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT:    [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i16_align1(
+; UNALIGNED_ONLY-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i16_align1(
+; RELAXED_OOB_ONLY-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i16> [[DATA]] to <4 x i8>
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 2
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 3
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i16_align1(
+; BOTH_FLAGS-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <2 x i16> %data, ptr addrspace(7) %q, align 1
+  ret void
+}
+
+; Naturally aligned <2 x i8> load (align 2). Per the generated checks: when
+; relaxed out-of-bounds handling is enabled (alone or together with unaligned
+; access) the load is merged into a single i16 load plus a bitcast; otherwise
+; it is split into two i8 loads (first at align 2, second at align 1).
+define <2 x i8> @load_v2i8_align2(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i8> @load_v2i8_align2(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT:    ret <2 x i8> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i8> @load_v2i8_align2(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT:    ret <2 x i8> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i8> @load_v2i8_align2(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
+; RELAXED_OOB_ONLY-NEXT:    ret <2 x i8> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i8> @load_v2i8_align2(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
+; BOTH_FLAGS-NEXT:    ret <2 x i8> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <2 x i8>, ptr addrspace(7) %q, align 2
+  ret <2 x i8> %ret
+}
+
+; Naturally aligned <2 x i8> store (align 2): mirror of the load test above.
+; When relaxed out-of-bounds handling is enabled the two bytes are bitcast to
+; i16 and stored with a single i16 store; otherwise the store is split into
+; two i8 stores.
+define void @store_v2i8_align2(<2 x i8> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i8_align2(
+; STRICT-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i8_align2(
+; UNALIGNED_ONLY-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i8_align2(
+; RELAXED_OOB_ONLY-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i8_align2(
+; BOTH_FLAGS-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <2 x i8> %data, ptr addrspace(7) %q, align 2
+  ret void
+}
+
+; Under-aligned <2 x i8> load (align 1). Per the generated checks, only the
+; configuration with both unaligned access and relaxed out-of-bounds handling
+; merges it into one i16 load at align 1; the other three modes split the
+; load into two byte loads (the relaxed-OOB-only variant additionally tags
+; the offset add with nuw).
+define <2 x i8> @load_v2i8_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <2 x i8> @load_v2i8_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT:    ret <2 x i8> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <2 x i8> @load_v2i8_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT:    ret <2 x i8> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <2 x i8> @load_v2i8_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    ret <2 x i8> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <2 x i8> @load_v2i8_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
+; BOTH_FLAGS-NEXT:    ret <2 x i8> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <2 x i8>, ptr addrspace(7) %q, align 1
+  ret <2 x i8> %ret
+}
+
+; Under-aligned <2 x i8> store (align 1): mirror of the load test above.
+; Only the configuration with both relaxation features keeps a single i16
+; store at align 1; the other three modes emit two byte stores.
+define void @store_v2i8_align1(<2 x i8> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v2i8_align1(
+; STRICT-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v2i8_align1(
+; UNALIGNED_ONLY-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v2i8_align1(
+; RELAXED_OOB_ONLY-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v2i8_align1(
+; BOTH_FLAGS-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <2 x i8> %data, ptr addrspace(7) %q, align 1
+  ret void
+}
+
+; Naturally aligned <4 x i8> load (align 4). Per the generated checks: with
+; relaxed out-of-bounds handling (alone or with unaligned access) the load is
+; merged into a single i32 load plus a bitcast; the other two modes split it
+; into four byte loads whose align operands track the offset within the word
+; (4, 1, 2, 1).
+define <4 x i8> @load_v4i8_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <4 x i8> @load_v4i8_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT:    [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT:    [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT:    ret <4 x i8> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <4 x i8> @load_v4i8_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; UNALIGNED_ONLY-NEXT:    ret <4 x i8> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <4 x i8> @load_v4i8_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
+; RELAXED_OOB_ONLY-NEXT:    ret <4 x i8> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <4 x i8> @load_v4i8_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
+; BOTH_FLAGS-NEXT:    ret <4 x i8> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <4 x i8>, ptr addrspace(7) %q, align 4
+  ret <4 x i8> %ret
+}
+
+; Naturally aligned <4 x i8> store (align 4): mirror of the load test above.
+; With relaxed out-of-bounds handling the vector is bitcast to i32 and stored
+; with one i32 store; the other two modes split it into four byte stores.
+define void @store_v4i8_align4(<4 x i8> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v4i8_align4(
+; STRICT-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT:    [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v4i8_align4(
+; UNALIGNED_ONLY-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v4i8_align4(
+; RELAXED_OOB_ONLY-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v4i8_align4(
+; BOTH_FLAGS-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <4 x i8> %data, ptr addrspace(7) %q, align 4
+  ret void
+}
+
+; Under-aligned <4 x i8> load (align 2). Per the generated checks: with both
+; relaxation features the load is merged into one i32 load at align 2; with
+; relaxed out-of-bounds handling only it becomes two aligned i16 loads
+; assembled through a <2 x i16>; the remaining two modes split it into four
+; byte loads.
+define <4 x i8> @load_v4i8_align2(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <4 x i8> @load_v4i8_align2(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT:    [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT:    [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT:    [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT:    ret <4 x i8> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <4 x i8> @load_v4i8_align2(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; UNALIGNED_ONLY-NEXT:    [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT:    [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; UNALIGNED_ONLY-NEXT:    ret <4 x i8> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <4 x i8> @load_v4i8_align2(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT:    [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[RET_SLICE_1:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    [[RET:%.*]] = bitcast <2 x i16> [[RET_SLICE_1]] to <4 x i8>
+; RELAXED_OOB_ONLY-NEXT:    ret <4 x i8> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <4 x i8> @load_v4i8_align2(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
+; BOTH_FLAGS-NEXT:    ret <4 x i8> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <4 x i8>, ptr addrspace(7) %q, align 2
+  ret <4 x i8> %ret
+}
+
+; Under-aligned <4 x i8> store (align 2): mirror of the load test above.
+; With both relaxation features a single i32 store at align 2 is kept; with
+; relaxed out-of-bounds handling only, the data is bitcast to <2 x i16> and
+; stored as two aligned i16 stores; the remaining two modes emit four byte
+; stores.
+define void @store_v4i8_align2(<4 x i8> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v4i8_align2(
+; STRICT-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT:    [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT:    [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; STRICT-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT:    ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v4i8_align2(
+; UNALIGNED_ONLY-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT:    [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; UNALIGNED_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT:    ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v4i8_align2(
+; RELAXED_OOB_ONLY-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to <2 x i16>
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT:    [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT:    ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v4i8_align2(
+; BOTH_FLAGS-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; BOTH_FLAGS-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <4 x i8> %data, ptr addrspace(7) %q, align 2
+  ret void
+}
+
+; Load <4 x i8> (4 bytes) at align 1: STRICT, UNALIGNED_ONLY, and RELAXED_OOB_ONLY
+; all split the access into four single-byte buffer loads (RELAXED_OOB_ONLY also
+; tags the offset adds `nuw`); only BOTH_FLAGS emits one vectorized i32 load.
+define <4 x i8> @load_v4i8_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <4 x i8> @load_v4i8_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: ret <4 x i8> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <4 x i8> @load_v4i8_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; UNALIGNED_ONLY-NEXT: ret <4 x i8> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <4 x i8> @load_v4i8_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; RELAXED_OOB_ONLY-NEXT: ret <4 x i8> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <4 x i8> @load_v4i8_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
+; BOTH_FLAGS-NEXT: ret <4 x i8> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <4 x i8>, ptr addrspace(7) %q, align 1
+  ret <4 x i8> %ret
+}
+
+; Store <4 x i8> (4 bytes) at align 1: mirror of load_v4i8_align1. STRICT,
+; UNALIGNED_ONLY, and RELAXED_OOB_ONLY scalarize into four byte stores
+; (RELAXED_OOB_ONLY marks the offset adds `nuw`); BOTH_FLAGS emits a single
+; i32 store of the bitcast vector.
+define void @store_v4i8_align1(<4 x i8> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v4i8_align1(
+; STRICT-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v4i8_align1(
+; UNALIGNED_ONLY-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v4i8_align1(
+; RELAXED_OOB_ONLY-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v4i8_align1(
+; BOTH_FLAGS-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <4 x i8> %data, ptr addrspace(7) %q, align 1
+  ret void
+}
+
+; Load <16 x i4> (8 bytes, sub-byte element type) at align 8: the value is
+; legalized via <8 x i8>. STRICT and UNALIGNED_ONLY still emit eight byte
+; loads (each annotated with the alignment implied by its offset: 8/1/2/1/4/...);
+; RELAXED_OOB_ONLY and BOTH_FLAGS each emit one v2i32 load and bitcast back.
+define <16 x i4> @load_v16i4_align8(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <16 x i4> @load_v16i4_align8(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; STRICT-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; STRICT-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; STRICT-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; STRICT-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; STRICT-NEXT: ret <16 x i4> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <16 x i4> @load_v16i4_align8(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; UNALIGNED_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <16 x i4> @load_v16i4_align8(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <16 x i4>
+; RELAXED_OOB_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <16 x i4> @load_v16i4_align8(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <16 x i4>
+; BOTH_FLAGS-NEXT: ret <16 x i4> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <16 x i4>, ptr addrspace(7) %q, align 8
+  ret <16 x i4> %ret
+}
+
+; Store <16 x i4> (8 bytes) at align 8: mirror of load_v16i4_align8. STRICT and
+; UNALIGNED_ONLY bitcast to <8 x i8> and emit eight byte stores; RELAXED_OOB_ONLY
+; and BOTH_FLAGS bitcast to <2 x i32> and emit a single v2i32 store.
+define void @store_v16i4_align8(<16 x i4> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v16i4_align8(
+; STRICT-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v16i4_align8(
+; UNALIGNED_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v16i4_align8(
+; RELAXED_OOB_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <2 x i32>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v16i4_align8(
+; BOTH_FLAGS-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <2 x i32>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <16 x i4> %data, ptr addrspace(7) %q, align 8
+  ret void
+}
+
+; Load <16 x i4> (8 bytes) at align 4: same shape as the align-8 case except the
+; base alignment annotation is 4. STRICT and UNALIGNED_ONLY scalarize to eight
+; byte loads; RELAXED_OOB_ONLY and BOTH_FLAGS emit one v2i32 load at align 4.
+define <16 x i4> @load_v16i4_align4(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <16 x i4> @load_v16i4_align4(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; STRICT-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; STRICT-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; STRICT-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; STRICT-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; STRICT-NEXT: ret <16 x i4> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <16 x i4> @load_v16i4_align4(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; UNALIGNED_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <16 x i4> @load_v16i4_align4(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <16 x i4>
+; RELAXED_OOB_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <16 x i4> @load_v16i4_align4(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <16 x i4>
+; BOTH_FLAGS-NEXT: ret <16 x i4> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  %ret = load <16 x i4>, ptr addrspace(7) %q, align 4
+  ret <16 x i4> %ret
+}
+
+; Store <16 x i4> (8 bytes) at align 4: mirror of load_v16i4_align4. STRICT and
+; UNALIGNED_ONLY bitcast to <8 x i8> and emit eight byte stores; RELAXED_OOB_ONLY
+; and BOTH_FLAGS bitcast to <2 x i32> and emit a single v2i32 store at align 4.
+define void @store_v16i4_align4(<16 x i4> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v16i4_align4(
+; STRICT-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v16i4_align4(
+; UNALIGNED_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v16i4_align4(
+; RELAXED_OOB_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <2 x i32>
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v16i4_align4(
+; BOTH_FLAGS-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <2 x i32>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+  store <16 x i4> %data, ptr addrspace(7) %q, align 4
+  ret void
+}
+
+define <16 x i4> @load_v16i4_align2(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <16 x i4> @load_v16i4_align2(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; STRICT-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; STRICT-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; STRICT-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; STRICT-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; STRICT-NEXT: ret <16 x i4> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <16 x i4> @load_v16i4_align2(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; UNALIGNED_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <16 x i4> @load_v16i4_align2(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <4 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <4 x i16> [[RET_SLICE_3]] to <16 x i4>
+; RELAXED_OOB_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <16 x i4> @load_v16i4_align2(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <16 x i4>
+; BOTH_FLAGS-NEXT: ret <16 x i4> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <16 x i4>, ptr addrspace(7) %q, align 2
+ ret <16 x i4> %ret
+}
+
+define void @store_v16i4_align2(<16 x i4> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v16i4_align2(
+; STRICT-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v16i4_align2(
+; UNALIGNED_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v16i4_align2(
+; RELAXED_OOB_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <4 x i16>
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 2
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i16> [[DATA_LEGAL]], i64 3
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v16i4_align2(
+; BOTH_FLAGS-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <2 x i32>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store <16 x i4> %data, ptr addrspace(7) %q, align 2
+ ret void
+}
+
+define <16 x i4> @load_v16i4_align1(ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define <16 x i4> @load_v16i4_align1(
+; STRICT-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; STRICT-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; STRICT-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; STRICT-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; STRICT-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; STRICT-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; STRICT-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; STRICT-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; STRICT-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; STRICT-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; STRICT-NEXT: ret <16 x i4> [[RET]]
+;
+; UNALIGNED_ONLY-LABEL: define <16 x i4> @load_v16i4_align1(
+; UNALIGNED_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; UNALIGNED_ONLY-NEXT: [[Q_OFF_PTR_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; UNALIGNED_ONLY-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; UNALIGNED_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; RELAXED_OOB_ONLY-LABEL: define <16 x i4> @load_v16i4_align1(
+; RELAXED_OOB_ONLY-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_5:%.*]] = add nuw i32 [[Q]], 5
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_5]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; RELAXED_OOB_ONLY-NEXT: [[Q_OFF_PTR_7:%.*]] = add nuw i32 [[Q]], 7
+; RELAXED_OOB_ONLY-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 [[Q_OFF_PTR_7]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[RET_SLICE_7:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; RELAXED_OOB_ONLY-NEXT: [[RET:%.*]] = bitcast <8 x i8> [[RET_SLICE_7]] to <16 x i4>
+; RELAXED_OOB_ONLY-NEXT: ret <16 x i4> [[RET]]
+;
+; BOTH_FLAGS-LABEL: define <16 x i4> @load_v16i4_align1(
+; BOTH_FLAGS-SAME: ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <16 x i4>
+; BOTH_FLAGS-NEXT: ret <16 x i4> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ %ret = load <16 x i4>, ptr addrspace(7) %q, align 1
+ ret <16 x i4> %ret
+}
+
+define void @store_v16i4_align1(<16 x i4> %data, ptr addrspace(8) inreg %buf, i32 %off) {
+; STRICT-LABEL: define void @store_v16i4_align1(
+; STRICT-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; STRICT-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; STRICT-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; STRICT-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; STRICT-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; STRICT-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; STRICT-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; STRICT-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; STRICT-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; STRICT-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; STRICT-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; STRICT-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; STRICT-NEXT: ret void
+;
+; UNALIGNED_ONLY-LABEL: define void @store_v16i4_align1(
+; UNALIGNED_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; UNALIGNED_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_1:%.*]] = add i32 [[Q]], 1
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_2:%.*]] = add i32 [[Q]], 2
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_3:%.*]] = add i32 [[Q]], 3
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_4:%.*]] = add i32 [[Q]], 4
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_5:%.*]] = add i32 [[Q]], 5
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_6:%.*]] = add i32 [[Q]], 6
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: [[Q_PART_7:%.*]] = add i32 [[Q]], 7
+; UNALIGNED_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; UNALIGNED_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; UNALIGNED_ONLY-NEXT: ret void
+;
+; RELAXED_OOB_ONLY-LABEL: define void @store_v16i4_align1(
+; RELAXED_OOB_ONLY-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; RELAXED_OOB_ONLY-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <8 x i8>
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 0
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_1:%.*]] = add nuw i32 [[Q]], 1
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 1
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_1]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_2:%.*]] = add nuw i32 [[Q]], 2
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 2
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_2]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_3:%.*]] = add nuw i32 [[Q]], 3
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 3
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_3]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_4:%.*]] = add nuw i32 [[Q]], 4
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 4
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_4]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_5:%.*]] = add nuw i32 [[Q]], 5
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 5
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_5]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_6:%.*]] = add nuw i32 [[Q]], 6
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 6
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_6]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: [[Q_PART_7:%.*]] = add nuw i32 [[Q]], 7
+; RELAXED_OOB_ONLY-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA_LEGAL]], i64 7
+; RELAXED_OOB_ONLY-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q_PART_7]], i32 0, i32 0)
+; RELAXED_OOB_ONLY-NEXT: ret void
+;
+; BOTH_FLAGS-LABEL: define void @store_v16i4_align1(
+; BOTH_FLAGS-SAME: <16 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]], i32 [[Q:%.*]]) #[[ATTR0]] {
+; BOTH_FLAGS-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i4> [[DATA]] to <2 x i32>
+; BOTH_FLAGS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 [[Q]], i32 0, i32 0)
+; BOTH_FLAGS-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %q = getelementptr i8, ptr addrspace(7) %p, i32 %off
+ store <16 x i4> %data, ptr addrspace(7) %q, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index a8e67a4a61816..e8e9855026691 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -212,7 +212,10 @@ define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i16> @load_v2i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <2 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
; CHECK-NEXT: ret <2 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -223,7 +226,10 @@ define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i16(
; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -234,7 +240,14 @@ define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i16> @load_v4i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <4 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
; CHECK-NEXT: ret <4 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -245,7 +258,14 @@ define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4i16(
; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i16> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -256,7 +276,22 @@ define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x i16> @load_v8i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i16> [[RET_SLICE_3]], i16 [[RET_OFF_8]], i64 4
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i16> [[RET_SLICE_4]], i16 [[RET_OFF_10]], i64 5
+; CHECK-NEXT: [[RET_OFF_12:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i16> [[RET_SLICE_5]], i16 [[RET_OFF_12]], i64 6
+; CHECK-NEXT: [[RET_OFF_14:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <8 x i16> [[RET_SLICE_6]], i16 [[RET_OFF_14]], i64 7
; CHECK-NEXT: ret <8 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -267,7 +302,22 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8i16(
; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i16> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i16> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i16> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_5]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i16> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i16> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_7]], ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -344,7 +394,10 @@ define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x half> @load_v2f16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x half> poison, half [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <2 x half> [[RET_SLICE_0]], half [[RET_OFF_2]], i64 1
; CHECK-NEXT: ret <2 x half> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -355,7 +408,10 @@ define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2f16(
; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x half> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x half> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -363,10 +419,45 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <2 x half> @load_v2f16_align2(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <2 x half> @load_v2f16_align2(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x half> poison, half [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <2 x half> [[RET_SLICE_0]], half [[RET_OFF_2]], i64 1
+; CHECK-NEXT: ret <2 x half> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x half>, ptr addrspace(7) %p, align 2
+ ret <2 x half> %ret
+}
+
+define void @store_v2f16_align2(<2 x half> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_v2f16_align2(
+; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x half> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x half> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x half> %data, ptr addrspace(7) %p, align 2
+ ret void
+}
+
define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x bfloat> @load_v4bf16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x bfloat> poison, bfloat [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x bfloat> [[RET_SLICE_0]], bfloat [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x bfloat> [[RET_SLICE_1]], bfloat [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <4 x bfloat> [[RET_SLICE_2]], bfloat [[RET_OFF_6]], i64 3
; CHECK-NEXT: ret <4 x bfloat> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -377,7 +468,14 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4bf16(
; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x bfloat> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x bfloat> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x bfloat> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x bfloat> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -388,7 +486,22 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x half> @load_v8f16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x half> poison, half [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x half> [[RET_SLICE_0]], half [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x half> [[RET_SLICE_1]], half [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x half> [[RET_SLICE_2]], half [[RET_OFF_6]], i64 3
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x half> [[RET_SLICE_3]], half [[RET_OFF_8]], i64 4
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x half> [[RET_SLICE_4]], half [[RET_OFF_10]], i64 5
+; CHECK-NEXT: [[RET_OFF_12:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x half> [[RET_SLICE_5]], half [[RET_OFF_12]], i64 6
+; CHECK-NEXT: [[RET_OFF_14:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <8 x half> [[RET_SLICE_6]], half [[RET_OFF_14]], i64 7
; CHECK-NEXT: ret <8 x half> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -399,7 +512,22 @@ define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8f16(
; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x half> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x half> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x half> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x half> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x half> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x half> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_5]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x half> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x half> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_7]], ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -764,8 +892,18 @@ define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %bu
define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x half> @load_v6f16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x half>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <6 x half> poison, half [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <6 x half> [[RET_SLICE_0]], half [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <6 x half> [[RET_SLICE_1]], half [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <6 x half> [[RET_SLICE_2]], half [[RET_OFF_6]], i64 3
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <6 x half> [[RET_SLICE_3]], half [[RET_OFF_8]], i64 4
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <6 x half> [[RET_SLICE_4]], half [[RET_OFF_10]], i64 5
; CHECK-NEXT: ret <6 x half> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -776,8 +914,18 @@ define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6f16(
; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <6 x half> [[DATA]] to <3 x i32>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <6 x half> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <6 x half> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <6 x half> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <6 x half> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <6 x half> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <6 x half> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_SLICE_5]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1030,9 +1178,10 @@ define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x i16> @load_v3i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <3 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = insertelement <3 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: [[RET:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
; CHECK-NEXT: ret <3 x i16> [[RET]]
@@ -1045,8 +1194,10 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v3i16(
; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <3 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <3 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA]], i64 2
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
@@ -1059,9 +1210,14 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <5 x i16> @load_v5i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i16> [[RET_OFF_0]], <4 x i16> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i16> poison, <5 x i16> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <5 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <5 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <5 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = insertelement <5 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_8]], i64 4
; CHECK-NEXT: ret <5 x i16> [[RET]]
@@ -1074,8 +1230,14 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v5i16(
; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i16> [[DATA]], <5 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <5 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <5 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <5 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <5 x i16> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i16> [[DATA]], i64 4
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
; CHECK-NEXT: ret void
@@ -1088,8 +1250,18 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i16> @load_v6i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x i16>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <6 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <6 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <6 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <6 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <6 x i16> [[RET_SLICE_3]], i16 [[RET_OFF_8]], i64 4
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <6 x i16> [[RET_SLICE_4]], i16 [[RET_OFF_10]], i64 5
; CHECK-NEXT: ret <6 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1100,8 +1272,18 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6i16(
; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <6 x i16> [[DATA]] to <3 x i32>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <6 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <6 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <6 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <6 x i16> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <6 x i16> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <6 x i16> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_5]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1112,10 +1294,18 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <7 x i16> @load_v7i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast <3 x i32> [[RET_OFF_0]] to <6 x i16>
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <6 x i16> [[RET_OFF_0_FROM_LOADABLE]], <6 x i16> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x i16> poison, <7 x i16> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <7 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <7 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <7 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <7 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <7 x i16> [[RET_SLICE_3]], i16 [[RET_OFF_8]], i64 4
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = insertelement <7 x i16> [[RET_SLICE_4]], i16 [[RET_OFF_10]], i64 5
; CHECK-NEXT: [[RET_OFF_12:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
; CHECK-NEXT: [[RET:%.*]] = insertelement <7 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_12]], i64 6
; CHECK-NEXT: ret <7 x i16> [[RET]]
@@ -1128,9 +1318,18 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v7i16(
; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x i16> [[DATA]], <7 x i16> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <6 x i16> [[DATA_SLICE_0]] to <3 x i32>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <7 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <7 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <7 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <7 x i16> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <7 x i16> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <7 x i16> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_5]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <7 x i16> [[DATA]], i64 6
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
; CHECK-NEXT: ret void
@@ -1143,9 +1342,22 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <9 x i16> @load_v9i16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <8 x i16> [[RET_OFF_0]], <8 x i16> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <9 x i16> poison, <9 x i16> [[RET_EXT_0]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <9 x i16> poison, i16 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <9 x i16> [[RET_SLICE_0]], i16 [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <9 x i16> [[RET_SLICE_1]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <9 x i16> [[RET_SLICE_2]], i16 [[RET_OFF_6]], i64 3
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <9 x i16> [[RET_SLICE_3]], i16 [[RET_OFF_8]], i64 4
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <9 x i16> [[RET_SLICE_4]], i16 [[RET_OFF_10]], i64 5
+; CHECK-NEXT: [[RET_OFF_12:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <9 x i16> [[RET_SLICE_5]], i16 [[RET_OFF_12]], i64 6
+; CHECK-NEXT: [[RET_OFF_14:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = insertelement <9 x i16> [[RET_SLICE_6]], i16 [[RET_OFF_14]], i64 7
; CHECK-NEXT: [[RET_OFF_16:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: [[RET:%.*]] = insertelement <9 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_16]], i64 8
; CHECK-NEXT: ret <9 x i16> [[RET]]
@@ -1158,8 +1370,22 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v9i16(
; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <9 x i16> [[DATA]], <9 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <9 x i16> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <9 x i16> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <9 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <9 x i16> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_3]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <9 x i16> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <9 x i16> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_5]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <9 x i16> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <9 x i16> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_7]], ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = extractelement <9 x i16> [[DATA]], i64 8
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
@@ -1200,8 +1426,10 @@ define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i8> @load_v2i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
; CHECK-NEXT: ret <2 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1212,8 +1440,10 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i8(
; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1224,10 +1454,10 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x i8> @load_v3i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_0]] to <2 x i8>
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i8> [[RET_OFF_0_FROM_LOADABLE]], <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i8> poison, <3 x i8> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <3 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = insertelement <3 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: [[RET:%.*]] = insertelement <3 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_2]], i64 2
; CHECK-NEXT: ret <3 x i8> [[RET]]
@@ -1240,9 +1470,10 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v3i8(
; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i8> [[DATA]], <3 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_0]] to i16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <3 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <3 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i8> [[DATA]], i64 2
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: ret void
@@ -1255,8 +1486,14 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i8> @load_v4i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
; CHECK-NEXT: ret <4 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1267,8 +1504,14 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4i8(
; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1279,10 +1522,14 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <5 x i8> @load_v5i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <5 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <5 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <5 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = insertelement <5 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_4]], i64 4
; CHECK-NEXT: ret <5 x i8> [[RET]]
@@ -1295,9 +1542,14 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v5i8(
; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <5 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <5 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <5 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <5 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA]], i64 4
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
@@ -1310,12 +1562,18 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i8> @load_v6i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
-; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
-; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
-; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <6 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <6 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <6 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <6 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <6 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <6 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
; CHECK-NEXT: ret <6 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1326,11 +1584,18 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6i8(
; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <6 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <6 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <6 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <6 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <6 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <6 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1341,14 +1606,18 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <7 x i8> @load_v7i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x i8> poison, <7 x i8> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
-; CHECK-NEXT: [[RET_OFF_4_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_4]] to <2 x i8>
-; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i8> [[RET_OFF_4_FROM_LOADABLE]], <2 x i8> poison, <7 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <7 x i8> [[RET_PARTS_0]], <7 x i8> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 6>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <7 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <7 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <7 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <7 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <7 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = insertelement <7 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
; CHECK-NEXT: [[RET:%.*]] = insertelement <7 x i8> [[RET_PARTS_4]], i8 [[RET_OFF_6]], i64 6
; CHECK-NEXT: ret <7 x i8> [[RET]]
@@ -1361,12 +1630,18 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v7i8(
; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <2 x i32> <i32 4, i32 5>
-; CHECK-NEXT: [[DATA_SLICE_4_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_4]] to i16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <7 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <7 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <7 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <7 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <7 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <7 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <7 x i8> [[DATA]], i64 6
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
; CHECK-NEXT: ret void
@@ -1379,8 +1654,22 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x i8> @load_v8i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <8 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; CHECK-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
; CHECK-NEXT: ret <8 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1391,8 +1680,22 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8i8(
; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <8 x i8> [[DATA]] to <2 x i32>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1400,11 +1703,85 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
ret void
}
+define <8 x i8> @load_v8i8_align1(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <8 x i8> @load_v8i8_align1(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <8 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <8 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <8 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <8 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <8 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <8 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <8 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; CHECK-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <8 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; CHECK-NEXT: ret <8 x i8> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i8>, ptr addrspace(7) %p, align 1
+ ret <8 x i8> %ret
+}
+
+define void @store_v8i8_align1(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_v8i8_align1(
+; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <8 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <8 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <8 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <8 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <8 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 1 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <8 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <8 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 1 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <8 x i8> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i8> %data, ptr addrspace(7) %p, align 1
+ ret void
+}
+
define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <12 x i8> @load_v12i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <12 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <12 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <12 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <12 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <12 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <12 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <12 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <12 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; CHECK-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_7:%.*]] = insertelement <12 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_8:%.*]] = insertelement <12 x i8> [[RET_SLICE_7]], i8 [[RET_OFF_8]], i64 8
+; CHECK-NEXT: [[RET_OFF_9:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 9, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_9:%.*]] = insertelement <12 x i8> [[RET_SLICE_8]], i8 [[RET_OFF_9]], i64 9
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_10:%.*]] = insertelement <12 x i8> [[RET_SLICE_9]], i8 [[RET_OFF_10]], i64 10
+; CHECK-NEXT: [[RET_OFF_11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 11, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <12 x i8> [[RET_SLICE_10]], i8 [[RET_OFF_11]], i64 11
; CHECK-NEXT: ret <12 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1415,8 +1792,30 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v12i8(
; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <12 x i8> [[DATA]] to <3 x i32>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <12 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <12 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <12 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <12 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <12 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <12 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <12 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <12 x i8> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = extractelement <12 x i8> [[DATA]], i64 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_8]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_9:%.*]] = extractelement <12 x i8> [[DATA]], i64 9
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_9]], ptr addrspace(8) align 1 [[BUF]], i32 9, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_10:%.*]] = extractelement <12 x i8> [[DATA]], i64 10
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_10]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_11:%.*]] = extractelement <12 x i8> [[DATA]], i64 11
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_11]], ptr addrspace(8) align 1 [[BUF]], i32 11, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1427,8 +1826,38 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <16 x i8> @load_v16i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast <4 x i32> [[RET_LOADABLE]] to <16 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <16 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <16 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <16 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <16 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <16 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <16 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <16 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; CHECK-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_7:%.*]] = insertelement <16 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_8:%.*]] = insertelement <16 x i8> [[RET_SLICE_7]], i8 [[RET_OFF_8]], i64 8
+; CHECK-NEXT: [[RET_OFF_9:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 9, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_9:%.*]] = insertelement <16 x i8> [[RET_SLICE_8]], i8 [[RET_OFF_9]], i64 9
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_10:%.*]] = insertelement <16 x i8> [[RET_SLICE_9]], i8 [[RET_OFF_10]], i64 10
+; CHECK-NEXT: [[RET_OFF_11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 11, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_11:%.*]] = insertelement <16 x i8> [[RET_SLICE_10]], i8 [[RET_OFF_11]], i64 11
+; CHECK-NEXT: [[RET_OFF_12:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_12:%.*]] = insertelement <16 x i8> [[RET_SLICE_11]], i8 [[RET_OFF_12]], i64 12
+; CHECK-NEXT: [[RET_OFF_13:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 13, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_13:%.*]] = insertelement <16 x i8> [[RET_SLICE_12]], i8 [[RET_OFF_13]], i64 13
+; CHECK-NEXT: [[RET_OFF_14:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_14:%.*]] = insertelement <16 x i8> [[RET_SLICE_13]], i8 [[RET_OFF_14]], i64 14
+; CHECK-NEXT: [[RET_OFF_15:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 15, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <16 x i8> [[RET_SLICE_14]], i8 [[RET_OFF_15]], i64 15
; CHECK-NEXT: ret <16 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1439,8 +1868,38 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v16i8(
; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i8> [[DATA]] to <4 x i32>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <16 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <16 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <16 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <16 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <16 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <16 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <16 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <16 x i8> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = extractelement <16 x i8> [[DATA]], i64 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_8]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_9:%.*]] = extractelement <16 x i8> [[DATA]], i64 9
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_9]], ptr addrspace(8) align 1 [[BUF]], i32 9, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_10:%.*]] = extractelement <16 x i8> [[DATA]], i64 10
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_10]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_11:%.*]] = extractelement <16 x i8> [[DATA]], i64 11
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_11]], ptr addrspace(8) align 1 [[BUF]], i32 11, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_12:%.*]] = extractelement <16 x i8> [[DATA]], i64 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_12]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_13:%.*]] = extractelement <16 x i8> [[DATA]], i64 13
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_13]], ptr addrspace(8) align 1 [[BUF]], i32 13, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_14:%.*]] = extractelement <16 x i8> [[DATA]], i64 14
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_14]], ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_15:%.*]] = extractelement <16 x i8> [[DATA]], i64 15
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_15]], ptr addrspace(8) align 1 [[BUF]], i32 15, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1451,13 +1910,70 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <32 x i8> @load_v32i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to <32 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <32 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <32 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <32 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <32 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <32 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_5:%.*]] = insertelement <32 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_6:%.*]] = insertelement <32 x i8> [[RET_SLICE_5]], i8 [[RET_OFF_6]], i64 6
+; CHECK-NEXT: [[RET_OFF_7:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_7:%.*]] = insertelement <32 x i8> [[RET_SLICE_6]], i8 [[RET_OFF_7]], i64 7
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_8:%.*]] = insertelement <32 x i8> [[RET_SLICE_7]], i8 [[RET_OFF_8]], i64 8
+; CHECK-NEXT: [[RET_OFF_9:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 9, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_9:%.*]] = insertelement <32 x i8> [[RET_SLICE_8]], i8 [[RET_OFF_9]], i64 9
+; CHECK-NEXT: [[RET_OFF_10:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_10:%.*]] = insertelement <32 x i8> [[RET_SLICE_9]], i8 [[RET_OFF_10]], i64 10
+; CHECK-NEXT: [[RET_OFF_11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 11, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_11:%.*]] = insertelement <32 x i8> [[RET_SLICE_10]], i8 [[RET_OFF_11]], i64 11
+; CHECK-NEXT: [[RET_OFF_12:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_12:%.*]] = insertelement <32 x i8> [[RET_SLICE_11]], i8 [[RET_OFF_12]], i64 12
+; CHECK-NEXT: [[RET_OFF_13:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 13, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_13:%.*]] = insertelement <32 x i8> [[RET_SLICE_12]], i8 [[RET_OFF_13]], i64 13
+; CHECK-NEXT: [[RET_OFF_14:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_14:%.*]] = insertelement <32 x i8> [[RET_SLICE_13]], i8 [[RET_OFF_14]], i64 14
+; CHECK-NEXT: [[RET_OFF_15:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 15, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_15:%.*]] = insertelement <32 x i8> [[RET_SLICE_14]], i8 [[RET_OFF_15]], i64 15
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_16:%.*]] = insertelement <32 x i8> [[RET_SLICE_15]], i8 [[RET_OFF_16]], i64 16
+; CHECK-NEXT: [[RET_OFF_17:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 17, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_17:%.*]] = insertelement <32 x i8> [[RET_SLICE_16]], i8 [[RET_OFF_17]], i64 17
+; CHECK-NEXT: [[RET_OFF_18:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 18, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_18:%.*]] = insertelement <32 x i8> [[RET_SLICE_17]], i8 [[RET_OFF_18]], i64 18
+; CHECK-NEXT: [[RET_OFF_19:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 19, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_19:%.*]] = insertelement <32 x i8> [[RET_SLICE_18]], i8 [[RET_OFF_19]], i64 19
+; CHECK-NEXT: [[RET_OFF_20:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 20, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_20:%.*]] = insertelement <32 x i8> [[RET_SLICE_19]], i8 [[RET_OFF_20]], i64 20
+; CHECK-NEXT: [[RET_OFF_21:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 21, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_21:%.*]] = insertelement <32 x i8> [[RET_SLICE_20]], i8 [[RET_OFF_21]], i64 21
+; CHECK-NEXT: [[RET_OFF_22:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 22, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_22:%.*]] = insertelement <32 x i8> [[RET_SLICE_21]], i8 [[RET_OFF_22]], i64 22
+; CHECK-NEXT: [[RET_OFF_23:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 23, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_23:%.*]] = insertelement <32 x i8> [[RET_SLICE_22]], i8 [[RET_OFF_23]], i64 23
+; CHECK-NEXT: [[RET_OFF_24:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 24, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_24:%.*]] = insertelement <32 x i8> [[RET_SLICE_23]], i8 [[RET_OFF_24]], i64 24
+; CHECK-NEXT: [[RET_OFF_25:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 25, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_25:%.*]] = insertelement <32 x i8> [[RET_SLICE_24]], i8 [[RET_OFF_25]], i64 25
+; CHECK-NEXT: [[RET_OFF_26:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 26, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_26:%.*]] = insertelement <32 x i8> [[RET_SLICE_25]], i8 [[RET_OFF_26]], i64 26
+; CHECK-NEXT: [[RET_OFF_27:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 27, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_27:%.*]] = insertelement <32 x i8> [[RET_SLICE_26]], i8 [[RET_OFF_27]], i64 27
+; CHECK-NEXT: [[RET_OFF_28:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 28, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_28:%.*]] = insertelement <32 x i8> [[RET_SLICE_27]], i8 [[RET_OFF_28]], i64 28
+; CHECK-NEXT: [[RET_OFF_29:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 29, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_29:%.*]] = insertelement <32 x i8> [[RET_SLICE_28]], i8 [[RET_OFF_29]], i64 29
+; CHECK-NEXT: [[RET_OFF_30:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 30, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_30:%.*]] = insertelement <32 x i8> [[RET_SLICE_29]], i8 [[RET_OFF_30]], i64 30
+; CHECK-NEXT: [[RET_OFF_31:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 31, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <32 x i8> [[RET_SLICE_30]], i8 [[RET_OFF_31]], i64 31
; CHECK-NEXT: ret <32 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1468,11 +1984,70 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v32i8(
; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i8> [[DATA]] to <8 x i32>
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <32 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <32 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <32 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <32 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <32 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <32 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <32 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_7:%.*]] = extractelement <32 x i8> [[DATA]], i64 7
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_7]], ptr addrspace(8) align 1 [[BUF]], i32 7, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = extractelement <32 x i8> [[DATA]], i64 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_8]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_9:%.*]] = extractelement <32 x i8> [[DATA]], i64 9
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_9]], ptr addrspace(8) align 1 [[BUF]], i32 9, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_10:%.*]] = extractelement <32 x i8> [[DATA]], i64 10
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_10]], ptr addrspace(8) align 2 [[BUF]], i32 10, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_11:%.*]] = extractelement <32 x i8> [[DATA]], i64 11
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_11]], ptr addrspace(8) align 1 [[BUF]], i32 11, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_12:%.*]] = extractelement <32 x i8> [[DATA]], i64 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_12]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_13:%.*]] = extractelement <32 x i8> [[DATA]], i64 13
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_13]], ptr addrspace(8) align 1 [[BUF]], i32 13, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_14:%.*]] = extractelement <32 x i8> [[DATA]], i64 14
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_14]], ptr addrspace(8) align 2 [[BUF]], i32 14, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_15:%.*]] = extractelement <32 x i8> [[DATA]], i64 15
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_15]], ptr addrspace(8) align 1 [[BUF]], i32 15, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_16:%.*]] = extractelement <32 x i8> [[DATA]], i64 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_16]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_17:%.*]] = extractelement <32 x i8> [[DATA]], i64 17
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_17]], ptr addrspace(8) align 1 [[BUF]], i32 17, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_18:%.*]] = extractelement <32 x i8> [[DATA]], i64 18
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_18]], ptr addrspace(8) align 2 [[BUF]], i32 18, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_19:%.*]] = extractelement <32 x i8> [[DATA]], i64 19
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_19]], ptr addrspace(8) align 1 [[BUF]], i32 19, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_20:%.*]] = extractelement <32 x i8> [[DATA]], i64 20
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_20]], ptr addrspace(8) align 4 [[BUF]], i32 20, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_21:%.*]] = extractelement <32 x i8> [[DATA]], i64 21
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_21]], ptr addrspace(8) align 1 [[BUF]], i32 21, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_22:%.*]] = extractelement <32 x i8> [[DATA]], i64 22
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_22]], ptr addrspace(8) align 2 [[BUF]], i32 22, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_23:%.*]] = extractelement <32 x i8> [[DATA]], i64 23
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_23]], ptr addrspace(8) align 1 [[BUF]], i32 23, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_24:%.*]] = extractelement <32 x i8> [[DATA]], i64 24
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_24]], ptr addrspace(8) align 8 [[BUF]], i32 24, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_25:%.*]] = extractelement <32 x i8> [[DATA]], i64 25
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_25]], ptr addrspace(8) align 1 [[BUF]], i32 25, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_26:%.*]] = extractelement <32 x i8> [[DATA]], i64 26
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_26]], ptr addrspace(8) align 2 [[BUF]], i32 26, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_27:%.*]] = extractelement <32 x i8> [[DATA]], i64 27
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_27]], ptr addrspace(8) align 1 [[BUF]], i32 27, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_28:%.*]] = extractelement <32 x i8> [[DATA]], i64 28
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_28]], ptr addrspace(8) align 4 [[BUF]], i32 28, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_29:%.*]] = extractelement <32 x i8> [[DATA]], i64 29
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_29]], ptr addrspace(8) align 1 [[BUF]], i32 29, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_30:%.*]] = extractelement <32 x i8> [[DATA]], i64 30
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_30]], ptr addrspace(8) align 2 [[BUF]], i32 30, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_31:%.*]] = extractelement <32 x i8> [[DATA]], i64 31
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_31]], ptr addrspace(8) align 1 [[BUF]], i32 31, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1513,11 +2088,12 @@ define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x i32] @load_a2i32(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 0
-; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET_ELEM_0]], 0
-; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 1
-; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x i32] [[RET_AS_ARRAY_0]], i32 [[RET_ELEM_1]], 1
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i32> poison, i32 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = insertelement <2 x i32> [[RET_SLICE_0]], i32 [[RET_OFF_4]], i64 1
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET_OFF_0]], 0
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x i32] [[RET_AS_ARRAY_0]], i32 [[RET_OFF_4]], 1
; CHECK-NEXT: ret [2 x i32] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1532,7 +2108,8 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA]], 1
; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA_AS_VEC_0]], i32 [[DATA_ELEM_1]], i64 1
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_ELEM_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_ELEM_1]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1543,11 +2120,12 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x half] @load_a2f16(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0
-; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0
-; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 1
-; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x half] [[RET_AS_ARRAY_0]], half [[RET_ELEM_1]], 1
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x half> poison, half [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = insertelement <2 x half> [[RET_SLICE_0]], half [[RET_OFF_2]], i64 1
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_OFF_0]], 0
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x half] [[RET_AS_ARRAY_0]], half [[RET_OFF_2]], 1
; CHECK-NEXT: ret [2 x half] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1562,7 +2140,8 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0
; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1
; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x half> [[DATA_AS_VEC_0]], half [[DATA_ELEM_1]], i64 1
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA_AS_VEC_1]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_ELEM_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA_ELEM_1]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1573,11 +2152,12 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x ptr addrspace(1)] @load_a2p1(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 0
-; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x ptr addrspace(1)] poison, ptr addrspace(1) [[RET_ELEM_0]], 0
-; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 1
-; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x ptr addrspace(1)] [[RET_AS_ARRAY_0]], ptr addrspace(1) [[RET_ELEM_1]], 1
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = insertelement <2 x ptr addrspace(1)> [[RET_SLICE_0]], ptr addrspace(1) [[RET_OFF_8]], i64 1
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x ptr addrspace(1)] poison, ptr addrspace(1) [[RET_OFF_0]], 0
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x ptr addrspace(1)] [[RET_AS_ARRAY_0]], ptr addrspace(1) [[RET_OFF_8]], 1
; CHECK-NEXT: ret [2 x ptr addrspace(1)] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1592,7 +2172,8 @@ define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %bu
; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[DATA_ELEM_0]], i64 0
; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 1
; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x ptr addrspace(1)> [[DATA_AS_VEC_0]], ptr addrspace(1) [[DATA_ELEM_1]], i64 1
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_AS_VEC_1]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) [[DATA_ELEM_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) [[DATA_ELEM_1]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1801,8 +2382,11 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i4> @load_v4i4(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <4 x i4>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <2 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <2 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = bitcast <2 x i8> [[RET_SLICE_1]] to <4 x i4>
; CHECK-NEXT: ret <4 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1813,8 +2397,11 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4i4(
; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i4> [[DATA]] to i16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i4> [[DATA]] to <2 x i8>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <2 x i8> [[DATA_LEGAL]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <2 x i8> [[DATA_LEGAL]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1825,8 +2412,15 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x i4> @load_v8i4(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <8 x i4>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET:%.*]] = bitcast <4 x i8> [[RET_SLICE_3]] to <8 x i4>
; CHECK-NEXT: ret <8 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1837,8 +2431,15 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8i4(
; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <8 x i4> [[DATA]] to i32
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <8 x i4> [[DATA]] to <4 x i8>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA_LEGAL]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1878,10 +2479,10 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i32> @load_v32i6(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
; CHECK-NEXT: [[RET:%.*]] = bitcast <6 x i32> [[RET_PARTS_4]] to <32 x i6>
@@ -1889,7 +2490,7 @@ define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
; CHECK-NEXT: ret <6 x i32> [[RET_CAST]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
- %ret = load <32 x i6>, ptr addrspace(7) %p
+ %ret = load <32 x i6>, ptr addrspace(7) %p, align 4
%ret.cast = bitcast <32 x i6> %ret to <6 x i32>
ret <6 x i32> %ret.cast
}
@@ -1899,14 +2500,14 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%data = bitcast <6 x i32> %data.abi to <32 x i6>
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
- store <32 x i6> %data, ptr addrspace(7) %p
+ store <32 x i6> %data, ptr addrspace(7) %p, align 4
ret void
}
@@ -1915,8 +2516,14 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i8> @volatile_load_v4i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
-; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <4 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <4 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <4 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <4 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
; CHECK-NEXT: ret <4 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1927,8 +2534,14 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @volatile_store_v4i8(
; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <4 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <4 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <4 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <4 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1939,12 +2552,18 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i8> @volatile_load_v6i8(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
-; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
-; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
-; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
-; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_0:%.*]] = insertelement <6 x i8> poison, i8 [[RET_OFF_0]], i64 0
+; CHECK-NEXT: [[RET_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_1:%.*]] = insertelement <6 x i8> [[RET_SLICE_0]], i8 [[RET_OFF_1]], i64 1
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <6 x i8> [[RET_SLICE_1]], i8 [[RET_OFF_2]], i64 2
+; CHECK-NEXT: [[RET_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_3:%.*]] = insertelement <6 x i8> [[RET_SLICE_2]], i8 [[RET_OFF_3]], i64 3
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <6 x i8> [[RET_SLICE_3]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET_OFF_5:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <6 x i8> [[RET_SLICE_4]], i8 [[RET_OFF_5]], i64 5
; CHECK-NEXT: ret <6 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1955,11 +2574,18 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @volatile_store_v6i8(
; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
-; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
-; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = extractelement <6 x i8> [[DATA]], i64 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_1:%.*]] = extractelement <6 x i8> [[DATA]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_1]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <6 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_3:%.*]] = extractelement <6 x i8> [[DATA]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_3]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <6 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_5:%.*]] = extractelement <6 x i8> [[DATA]], i64 5
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_5]], ptr addrspace(8) align 1 [[BUF]], i32 5, i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1970,17 +2596,19 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x [2 x i32]] @load_a2a2i32(
; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[RET0_ELEM_0:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 0
-; CHECK-NEXT: [[RET0_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET0_ELEM_0]], 0
-; CHECK-NEXT: [[RET0_ELEM_1:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 1
-; CHECK-NEXT: [[RET0_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET0_AS_ARRAY_0]], i32 [[RET0_ELEM_1]], 1
+; CHECK-NEXT: [[RET0_OFF_1:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET0_SLICE_0:%.*]] = insertelement <2 x i32> poison, i32 [[RET0_OFF_1]], i64 0
+; CHECK-NEXT: [[RET0_OFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET0_OFF_0:%.*]] = insertelement <2 x i32> [[RET0_SLICE_0]], i32 [[RET0_OFF_4]], i64 1
+; CHECK-NEXT: [[RET0_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET0_OFF_1]], 0
+; CHECK-NEXT: [[RET0_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET0_AS_ARRAY_0]], i32 [[RET0_OFF_4]], 1
; CHECK-NEXT: [[RET0:%.*]] = insertvalue [2 x [2 x i32]] poison, [2 x i32] [[RET0_AS_ARRAY_1]], 0
-; CHECK-NEXT: [[RET1_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
-; CHECK-NEXT: [[RET1_ELEM_0:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 0
-; CHECK-NEXT: [[RET1_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET1_ELEM_0]], 0
-; CHECK-NEXT: [[RET1_ELEM_1:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 1
-; CHECK-NEXT: [[RET1_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET1_AS_ARRAY_0]], i32 [[RET1_ELEM_1]], 1
+; CHECK-NEXT: [[RET1_OFF_9:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET1_SLICE_0:%.*]] = insertelement <2 x i32> poison, i32 [[RET1_OFF_9]], i64 0
+; CHECK-NEXT: [[RET1_OFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET1_OFF_8:%.*]] = insertelement <2 x i32> [[RET1_SLICE_0]], i32 [[RET1_OFF_12]], i64 1
+; CHECK-NEXT: [[RET1_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET1_OFF_9]], 0
+; CHECK-NEXT: [[RET1_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET1_AS_ARRAY_0]], i32 [[RET1_OFF_12]], 1
; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x [2 x i32]] [[RET0]], [2 x i32] [[RET1_AS_ARRAY_1]], 1
; CHECK-NEXT: ret [2 x [2 x i32]] [[RET]]
;
@@ -1997,13 +2625,15 @@ define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
; CHECK-NEXT: [[DATA0_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA0_ELEM_0]], i64 0
; CHECK-NEXT: [[DATA0_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA0]], 1
; CHECK-NEXT: [[DATA0_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA0_AS_VEC_0]], i32 [[DATA0_ELEM_1]], i64 1
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA0_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA0_ELEM_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA0_ELEM_1]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: [[DATA1:%.*]] = extractvalue [2 x [2 x i32]] [[DATA]], 1
; CHECK-NEXT: [[DATA1_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA1]], 0
; CHECK-NEXT: [[DATA1_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA1_ELEM_0]], i64 0
; CHECK-NEXT: [[DATA1_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA1]], 1
; CHECK-NEXT: [[DATA1_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA1_AS_VEC_0]], i32 [[DATA1_ELEM_1]], i64 1
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA1_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA1_ELEM_0]], ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA1_ELEM_1]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index ee51b0b84554e..5874192ea7921 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -3,7 +3,7 @@
; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
-target triple = "amdgcn--"
+target triple = "amdgcn-amd-amdhsa"
;; memcpy
@@ -27,112 +27,112 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d
; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]]
; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP3]], 16
; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP3]], 32
; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP3]], 48
; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add i32 [[TMP3]], 64
; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add i32 [[TMP3]], 80
; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add i32 [[TMP3]], 96
; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add i32 [[TMP3]], 112
; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add i32 [[TMP3]], 128
; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add i32 [[TMP3]], 144
; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add i32 [[TMP3]], 160
; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add i32 [[TMP3]], 176
; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add i32 [[TMP3]], 192
; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add i32 [[TMP3]], 208
; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add i32 [[TMP3]], 224
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add i32 [[TMP3]], 240
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
@@ -151,12 +151,44 @@ define void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr addrspace(7) in
; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0)
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[SRC_OFF]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <4 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[SRC_OFF]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <4 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[SRC_OFF]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[DST_OFF]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[DST_OFF]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[DST_OFF]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTOFF_05:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_06:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_05]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_433:%.*]] = add i32 [[TMP2]], 4
+; CHECK-NEXT: [[DOTOFF_48:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_433]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_19:%.*]] = insertelement <4 x i32> [[DOTSLICE_06]], i32 [[DOTOFF_48]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_845:%.*]] = add i32 [[TMP2]], 8
+; CHECK-NEXT: [[DOTOFF_811:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_845]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_212:%.*]] = insertelement <4 x i32> [[DOTSLICE_19]], i32 [[DOTOFF_811]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_1257:%.*]] = add i32 [[TMP2]], 12
+; CHECK-NEXT: [[DOTOFF_1214:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_1257]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[DOTSLICE_212]], i32 [[DOTOFF_1214]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_05]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_478:%.*]] = add i32 [[TMP4]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_478]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_886:%.*]] = add i32 [[TMP4]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_811]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_886]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_1294:%.*]] = add i32 [[TMP4]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_1214]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_1294]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 false)
@@ -217,117 +249,326 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ]
; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]]
-; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
-; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
-; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
-; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
-; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
-; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
-; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
-; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
-; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
-; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
-; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
-; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
-; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
-; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
-; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
-; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <64 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[TMP1]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <64 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[TMP1]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <64 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[TMP1]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_3:%.*]] = insertelement <64 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_16:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_4:%.*]] = insertelement <64 x i32> [[DOTSLICE_3]], i32 [[DOTOFF_16]], i64 4
+; CHECK-NEXT: [[DOTOFF_PTR_20:%.*]] = add i32 [[TMP1]], 20
+; CHECK-NEXT: [[DOTOFF_20:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_5:%.*]] = insertelement <64 x i32> [[DOTSLICE_4]], i32 [[DOTOFF_20]], i64 5
+; CHECK-NEXT: [[DOTOFF_PTR_24:%.*]] = add i32 [[TMP1]], 24
+; CHECK-NEXT: [[DOTOFF_24:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_6:%.*]] = insertelement <64 x i32> [[DOTSLICE_5]], i32 [[DOTOFF_24]], i64 6
+; CHECK-NEXT: [[DOTOFF_PTR_28:%.*]] = add i32 [[TMP1]], 28
+; CHECK-NEXT: [[DOTOFF_28:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_7:%.*]] = insertelement <64 x i32> [[DOTSLICE_6]], i32 [[DOTOFF_28]], i64 7
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_32:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_8:%.*]] = insertelement <64 x i32> [[DOTSLICE_7]], i32 [[DOTOFF_32]], i64 8
+; CHECK-NEXT: [[DOTOFF_PTR_36:%.*]] = add i32 [[TMP1]], 36
+; CHECK-NEXT: [[DOTOFF_36:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_9:%.*]] = insertelement <64 x i32> [[DOTSLICE_8]], i32 [[DOTOFF_36]], i64 9
+; CHECK-NEXT: [[DOTOFF_PTR_40:%.*]] = add i32 [[TMP1]], 40
+; CHECK-NEXT: [[DOTOFF_40:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_10:%.*]] = insertelement <64 x i32> [[DOTSLICE_9]], i32 [[DOTOFF_40]], i64 10
+; CHECK-NEXT: [[DOTOFF_PTR_44:%.*]] = add i32 [[TMP1]], 44
+; CHECK-NEXT: [[DOTOFF_44:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_11:%.*]] = insertelement <64 x i32> [[DOTSLICE_10]], i32 [[DOTOFF_44]], i64 11
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_48:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_12:%.*]] = insertelement <64 x i32> [[DOTSLICE_11]], i32 [[DOTOFF_48]], i64 12
+; CHECK-NEXT: [[DOTOFF_PTR_52:%.*]] = add i32 [[TMP1]], 52
+; CHECK-NEXT: [[DOTOFF_52:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_13:%.*]] = insertelement <64 x i32> [[DOTSLICE_12]], i32 [[DOTOFF_52]], i64 13
+; CHECK-NEXT: [[DOTOFF_PTR_56:%.*]] = add i32 [[TMP1]], 56
+; CHECK-NEXT: [[DOTOFF_56:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_14:%.*]] = insertelement <64 x i32> [[DOTSLICE_13]], i32 [[DOTOFF_56]], i64 14
+; CHECK-NEXT: [[DOTOFF_PTR_60:%.*]] = add i32 [[TMP1]], 60
+; CHECK-NEXT: [[DOTOFF_60:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_60]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_15:%.*]] = insertelement <64 x i32> [[DOTSLICE_14]], i32 [[DOTOFF_60]], i64 15
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_64:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_16:%.*]] = insertelement <64 x i32> [[DOTSLICE_15]], i32 [[DOTOFF_64]], i64 16
+; CHECK-NEXT: [[DOTOFF_PTR_68:%.*]] = add i32 [[TMP1]], 68
+; CHECK-NEXT: [[DOTOFF_68:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_68]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_17:%.*]] = insertelement <64 x i32> [[DOTSLICE_16]], i32 [[DOTOFF_68]], i64 17
+; CHECK-NEXT: [[DOTOFF_PTR_72:%.*]] = add i32 [[TMP1]], 72
+; CHECK-NEXT: [[DOTOFF_72:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_72]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_18:%.*]] = insertelement <64 x i32> [[DOTSLICE_17]], i32 [[DOTOFF_72]], i64 18
+; CHECK-NEXT: [[DOTOFF_PTR_76:%.*]] = add i32 [[TMP1]], 76
+; CHECK-NEXT: [[DOTOFF_76:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_76]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_19:%.*]] = insertelement <64 x i32> [[DOTSLICE_18]], i32 [[DOTOFF_76]], i64 19
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_80:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_20:%.*]] = insertelement <64 x i32> [[DOTSLICE_19]], i32 [[DOTOFF_80]], i64 20
+; CHECK-NEXT: [[DOTOFF_PTR_84:%.*]] = add i32 [[TMP1]], 84
+; CHECK-NEXT: [[DOTOFF_84:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_84]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_21:%.*]] = insertelement <64 x i32> [[DOTSLICE_20]], i32 [[DOTOFF_84]], i64 21
+; CHECK-NEXT: [[DOTOFF_PTR_88:%.*]] = add i32 [[TMP1]], 88
+; CHECK-NEXT: [[DOTOFF_88:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_88]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_22:%.*]] = insertelement <64 x i32> [[DOTSLICE_21]], i32 [[DOTOFF_88]], i64 22
+; CHECK-NEXT: [[DOTOFF_PTR_92:%.*]] = add i32 [[TMP1]], 92
+; CHECK-NEXT: [[DOTOFF_92:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_92]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_23:%.*]] = insertelement <64 x i32> [[DOTSLICE_22]], i32 [[DOTOFF_92]], i64 23
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_96:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_24:%.*]] = insertelement <64 x i32> [[DOTSLICE_23]], i32 [[DOTOFF_96]], i64 24
+; CHECK-NEXT: [[DOTOFF_PTR_100:%.*]] = add i32 [[TMP1]], 100
+; CHECK-NEXT: [[DOTOFF_100:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_100]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_25:%.*]] = insertelement <64 x i32> [[DOTSLICE_24]], i32 [[DOTOFF_100]], i64 25
+; CHECK-NEXT: [[DOTOFF_PTR_104:%.*]] = add i32 [[TMP1]], 104
+; CHECK-NEXT: [[DOTOFF_104:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_104]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_26:%.*]] = insertelement <64 x i32> [[DOTSLICE_25]], i32 [[DOTOFF_104]], i64 26
+; CHECK-NEXT: [[DOTOFF_PTR_108:%.*]] = add i32 [[TMP1]], 108
+; CHECK-NEXT: [[DOTOFF_108:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_108]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_27:%.*]] = insertelement <64 x i32> [[DOTSLICE_26]], i32 [[DOTOFF_108]], i64 27
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_112:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_28:%.*]] = insertelement <64 x i32> [[DOTSLICE_27]], i32 [[DOTOFF_112]], i64 28
+; CHECK-NEXT: [[DOTOFF_PTR_116:%.*]] = add i32 [[TMP1]], 116
+; CHECK-NEXT: [[DOTOFF_116:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_116]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_29:%.*]] = insertelement <64 x i32> [[DOTSLICE_28]], i32 [[DOTOFF_116]], i64 29
+; CHECK-NEXT: [[DOTOFF_PTR_120:%.*]] = add i32 [[TMP1]], 120
+; CHECK-NEXT: [[DOTOFF_120:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_120]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_30:%.*]] = insertelement <64 x i32> [[DOTSLICE_29]], i32 [[DOTOFF_120]], i64 30
+; CHECK-NEXT: [[DOTOFF_PTR_124:%.*]] = add i32 [[TMP1]], 124
+; CHECK-NEXT: [[DOTOFF_124:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_124]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_31:%.*]] = insertelement <64 x i32> [[DOTSLICE_30]], i32 [[DOTOFF_124]], i64 31
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_128:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_32:%.*]] = insertelement <64 x i32> [[DOTSLICE_31]], i32 [[DOTOFF_128]], i64 32
+; CHECK-NEXT: [[DOTOFF_PTR_132:%.*]] = add i32 [[TMP1]], 132
+; CHECK-NEXT: [[DOTOFF_132:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_132]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_33:%.*]] = insertelement <64 x i32> [[DOTSLICE_32]], i32 [[DOTOFF_132]], i64 33
+; CHECK-NEXT: [[DOTOFF_PTR_136:%.*]] = add i32 [[TMP1]], 136
+; CHECK-NEXT: [[DOTOFF_136:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_136]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_34:%.*]] = insertelement <64 x i32> [[DOTSLICE_33]], i32 [[DOTOFF_136]], i64 34
+; CHECK-NEXT: [[DOTOFF_PTR_140:%.*]] = add i32 [[TMP1]], 140
+; CHECK-NEXT: [[DOTOFF_140:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_140]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_35:%.*]] = insertelement <64 x i32> [[DOTSLICE_34]], i32 [[DOTOFF_140]], i64 35
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_144:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_36:%.*]] = insertelement <64 x i32> [[DOTSLICE_35]], i32 [[DOTOFF_144]], i64 36
+; CHECK-NEXT: [[DOTOFF_PTR_148:%.*]] = add i32 [[TMP1]], 148
+; CHECK-NEXT: [[DOTOFF_148:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_148]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_37:%.*]] = insertelement <64 x i32> [[DOTSLICE_36]], i32 [[DOTOFF_148]], i64 37
+; CHECK-NEXT: [[DOTOFF_PTR_152:%.*]] = add i32 [[TMP1]], 152
+; CHECK-NEXT: [[DOTOFF_152:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_152]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_38:%.*]] = insertelement <64 x i32> [[DOTSLICE_37]], i32 [[DOTOFF_152]], i64 38
+; CHECK-NEXT: [[DOTOFF_PTR_156:%.*]] = add i32 [[TMP1]], 156
+; CHECK-NEXT: [[DOTOFF_156:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_156]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_39:%.*]] = insertelement <64 x i32> [[DOTSLICE_38]], i32 [[DOTOFF_156]], i64 39
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_160:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_40:%.*]] = insertelement <64 x i32> [[DOTSLICE_39]], i32 [[DOTOFF_160]], i64 40
+; CHECK-NEXT: [[DOTOFF_PTR_164:%.*]] = add i32 [[TMP1]], 164
+; CHECK-NEXT: [[DOTOFF_164:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_164]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_41:%.*]] = insertelement <64 x i32> [[DOTSLICE_40]], i32 [[DOTOFF_164]], i64 41
+; CHECK-NEXT: [[DOTOFF_PTR_168:%.*]] = add i32 [[TMP1]], 168
+; CHECK-NEXT: [[DOTOFF_168:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_168]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_42:%.*]] = insertelement <64 x i32> [[DOTSLICE_41]], i32 [[DOTOFF_168]], i64 42
+; CHECK-NEXT: [[DOTOFF_PTR_172:%.*]] = add i32 [[TMP1]], 172
+; CHECK-NEXT: [[DOTOFF_172:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_172]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_43:%.*]] = insertelement <64 x i32> [[DOTSLICE_42]], i32 [[DOTOFF_172]], i64 43
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_176:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_44:%.*]] = insertelement <64 x i32> [[DOTSLICE_43]], i32 [[DOTOFF_176]], i64 44
+; CHECK-NEXT: [[DOTOFF_PTR_180:%.*]] = add i32 [[TMP1]], 180
+; CHECK-NEXT: [[DOTOFF_180:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_180]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_45:%.*]] = insertelement <64 x i32> [[DOTSLICE_44]], i32 [[DOTOFF_180]], i64 45
+; CHECK-NEXT: [[DOTOFF_PTR_184:%.*]] = add i32 [[TMP1]], 184
+; CHECK-NEXT: [[DOTOFF_184:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_184]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_46:%.*]] = insertelement <64 x i32> [[DOTSLICE_45]], i32 [[DOTOFF_184]], i64 46
+; CHECK-NEXT: [[DOTOFF_PTR_188:%.*]] = add i32 [[TMP1]], 188
+; CHECK-NEXT: [[DOTOFF_188:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_188]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_47:%.*]] = insertelement <64 x i32> [[DOTSLICE_46]], i32 [[DOTOFF_188]], i64 47
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_192:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_48:%.*]] = insertelement <64 x i32> [[DOTSLICE_47]], i32 [[DOTOFF_192]], i64 48
+; CHECK-NEXT: [[DOTOFF_PTR_196:%.*]] = add i32 [[TMP1]], 196
+; CHECK-NEXT: [[DOTOFF_196:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_196]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_49:%.*]] = insertelement <64 x i32> [[DOTSLICE_48]], i32 [[DOTOFF_196]], i64 49
+; CHECK-NEXT: [[DOTOFF_PTR_200:%.*]] = add i32 [[TMP1]], 200
+; CHECK-NEXT: [[DOTOFF_200:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_200]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_50:%.*]] = insertelement <64 x i32> [[DOTSLICE_49]], i32 [[DOTOFF_200]], i64 50
+; CHECK-NEXT: [[DOTOFF_PTR_204:%.*]] = add i32 [[TMP1]], 204
+; CHECK-NEXT: [[DOTOFF_204:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_204]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_51:%.*]] = insertelement <64 x i32> [[DOTSLICE_50]], i32 [[DOTOFF_204]], i64 51
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_208:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_52:%.*]] = insertelement <64 x i32> [[DOTSLICE_51]], i32 [[DOTOFF_208]], i64 52
+; CHECK-NEXT: [[DOTOFF_PTR_212:%.*]] = add i32 [[TMP1]], 212
+; CHECK-NEXT: [[DOTOFF_212:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_212]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_53:%.*]] = insertelement <64 x i32> [[DOTSLICE_52]], i32 [[DOTOFF_212]], i64 53
+; CHECK-NEXT: [[DOTOFF_PTR_216:%.*]] = add i32 [[TMP1]], 216
+; CHECK-NEXT: [[DOTOFF_216:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_216]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_54:%.*]] = insertelement <64 x i32> [[DOTSLICE_53]], i32 [[DOTOFF_216]], i64 54
+; CHECK-NEXT: [[DOTOFF_PTR_220:%.*]] = add i32 [[TMP1]], 220
+; CHECK-NEXT: [[DOTOFF_220:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_220]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_55:%.*]] = insertelement <64 x i32> [[DOTSLICE_54]], i32 [[DOTOFF_220]], i64 55
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_224:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_56:%.*]] = insertelement <64 x i32> [[DOTSLICE_55]], i32 [[DOTOFF_224]], i64 56
+; CHECK-NEXT: [[DOTOFF_PTR_228:%.*]] = add i32 [[TMP1]], 228
+; CHECK-NEXT: [[DOTOFF_228:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_228]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_57:%.*]] = insertelement <64 x i32> [[DOTSLICE_56]], i32 [[DOTOFF_228]], i64 57
+; CHECK-NEXT: [[DOTOFF_PTR_232:%.*]] = add i32 [[TMP1]], 232
+; CHECK-NEXT: [[DOTOFF_232:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_232]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_58:%.*]] = insertelement <64 x i32> [[DOTSLICE_57]], i32 [[DOTOFF_232]], i64 58
+; CHECK-NEXT: [[DOTOFF_PTR_236:%.*]] = add i32 [[TMP1]], 236
+; CHECK-NEXT: [[DOTOFF_236:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_236]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_59:%.*]] = insertelement <64 x i32> [[DOTSLICE_58]], i32 [[DOTOFF_236]], i64 59
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_240:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_60:%.*]] = insertelement <64 x i32> [[DOTSLICE_59]], i32 [[DOTOFF_240]], i64 60
+; CHECK-NEXT: [[DOTOFF_PTR_244:%.*]] = add i32 [[TMP1]], 244
+; CHECK-NEXT: [[DOTOFF_244:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_244]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_61:%.*]] = insertelement <64 x i32> [[DOTSLICE_60]], i32 [[DOTOFF_244]], i64 61
+; CHECK-NEXT: [[DOTOFF_PTR_248:%.*]] = add i32 [[TMP1]], 248
+; CHECK-NEXT: [[DOTOFF_248:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_248]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_62:%.*]] = insertelement <64 x i32> [[DOTSLICE_61]], i32 [[DOTOFF_248]], i64 62
+; CHECK-NEXT: [[DOTOFF_PTR_252:%.*]] = add i32 [[TMP1]], 252
+; CHECK-NEXT: [[DOTOFF_252:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_252]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <64 x i32> [[DOTSLICE_62]], i32 [[DOTOFF_252]], i64 63
; CHECK-NEXT: [[LOOP_INDEX_C1:%.*]] = trunc i64 [[LOOP_INDEX]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX_C1]]
-; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16
-; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32
-; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48
-; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64
-; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80
-; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96
-; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112
-; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128
-; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144
-; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160
-; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176
-; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192
-; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208
-; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224
-; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_1:%.*]] = add i32 [[TMP3]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_1]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_2:%.*]] = add i32 [[TMP3]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_3:%.*]] = add i32 [[TMP3]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_3]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP3]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_5:%.*]] = add i32 [[TMP3]], 20
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_5]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_6:%.*]] = add i32 [[TMP3]], 24
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_24]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_6]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_7:%.*]] = add i32 [[TMP3]], 28
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_7]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP3]], 32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_9:%.*]] = add i32 [[TMP3]], 36
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_9]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_10:%.*]] = add i32 [[TMP3]], 40
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_10]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_11:%.*]] = add i32 [[TMP3]], 44
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_11]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP3]], 48
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_13:%.*]] = add i32 [[TMP3]], 52
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_13]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_14:%.*]] = add i32 [[TMP3]], 56
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_14]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_15:%.*]] = add i32 [[TMP3]], 60
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_15]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add i32 [[TMP3]], 64
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_64]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_17:%.*]] = add i32 [[TMP3]], 68
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_68]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_17]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_18:%.*]] = add i32 [[TMP3]], 72
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_72]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_18]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_19:%.*]] = add i32 [[TMP3]], 76
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_76]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_19]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add i32 [[TMP3]], 80
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_80]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_21:%.*]] = add i32 [[TMP3]], 84
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_84]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_21]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_22:%.*]] = add i32 [[TMP3]], 88
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_88]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_22]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_23:%.*]] = add i32 [[TMP3]], 92
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_92]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_23]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add i32 [[TMP3]], 96
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_96]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_25:%.*]] = add i32 [[TMP3]], 100
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_100]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_25]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_26:%.*]] = add i32 [[TMP3]], 104
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_104]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_26]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_27:%.*]] = add i32 [[TMP3]], 108
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_108]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_27]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add i32 [[TMP3]], 112
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_112]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_29:%.*]] = add i32 [[TMP3]], 116
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_116]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_29]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_30:%.*]] = add i32 [[TMP3]], 120
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_120]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_30]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_31:%.*]] = add i32 [[TMP3]], 124
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_124]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_31]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add i32 [[TMP3]], 128
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_128]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_33:%.*]] = add i32 [[TMP3]], 132
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_132]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_33]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_34:%.*]] = add i32 [[TMP3]], 136
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_136]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_34]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_35:%.*]] = add i32 [[TMP3]], 140
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_140]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_35]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add i32 [[TMP3]], 144
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_144]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_37:%.*]] = add i32 [[TMP3]], 148
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_148]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_37]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_38:%.*]] = add i32 [[TMP3]], 152
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_152]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_38]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_39:%.*]] = add i32 [[TMP3]], 156
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_156]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_39]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add i32 [[TMP3]], 160
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_160]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_41:%.*]] = add i32 [[TMP3]], 164
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_164]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_41]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_42:%.*]] = add i32 [[TMP3]], 168
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_168]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_42]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_43:%.*]] = add i32 [[TMP3]], 172
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_172]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_43]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add i32 [[TMP3]], 176
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_176]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_45:%.*]] = add i32 [[TMP3]], 180
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_180]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_45]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_46:%.*]] = add i32 [[TMP3]], 184
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_184]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_46]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_47:%.*]] = add i32 [[TMP3]], 188
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_188]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_47]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add i32 [[TMP3]], 192
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_192]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_49:%.*]] = add i32 [[TMP3]], 196
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_196]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_49]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_50:%.*]] = add i32 [[TMP3]], 200
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_200]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_50]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_51:%.*]] = add i32 [[TMP3]], 204
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_204]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_51]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add i32 [[TMP3]], 208
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_208]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_53:%.*]] = add i32 [[TMP3]], 212
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_212]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_53]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_54:%.*]] = add i32 [[TMP3]], 216
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_216]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_54]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_55:%.*]] = add i32 [[TMP3]], 220
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_220]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_55]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add i32 [[TMP3]], 224
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_224]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_57:%.*]] = add i32 [[TMP3]], 228
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_228]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_57]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_58:%.*]] = add i32 [[TMP3]], 232
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_232]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_58]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_59:%.*]] = add i32 [[TMP3]], 236
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_236]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_59]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add i32 [[TMP3]], 240
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_61:%.*]] = add i32 [[TMP3]], 244
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_244]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_61]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_62:%.*]] = add i32 [[TMP3]], 248
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_248]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_62]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_63:%.*]] = add i32 [[TMP3]], 252
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_252]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_63]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -345,12 +586,44 @@ define void @memcpy_known_i32_volatile(ptr addrspace(7) inreg %src, ptr addrspac
; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648)
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[SRC_OFF]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <4 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[SRC_OFF]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <4 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[SRC_OFF]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[DST_OFF]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[DST_OFF]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[DST_OFF]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 -2147483648)
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTOFF_05:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_06:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_05]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_433:%.*]] = add i32 [[TMP2]], 4
+; CHECK-NEXT: [[DOTOFF_48:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_433]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_19:%.*]] = insertelement <4 x i32> [[DOTSLICE_06]], i32 [[DOTOFF_48]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_845:%.*]] = add i32 [[TMP2]], 8
+; CHECK-NEXT: [[DOTOFF_811:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_845]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_212:%.*]] = insertelement <4 x i32> [[DOTSLICE_19]], i32 [[DOTOFF_811]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_1257:%.*]] = add i32 [[TMP2]], 12
+; CHECK-NEXT: [[DOTOFF_1214:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_1257]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[DOTSLICE_212]], i32 [[DOTOFF_1214]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_05]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_478:%.*]] = add i32 [[TMP4]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_478]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_886:%.*]] = add i32 [[TMP4]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_811]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_886]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_1294:%.*]] = add i32 [[TMP4]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_1214]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_1294]], i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 true)
@@ -371,9 +644,25 @@ define void @memcpy_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg
; CHECK: [[LOOP_MEMCPY_EXPANSION]]:
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[TMP4]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <4 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[TMP4]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <4 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[TMP4]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP6]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP6]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP6]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16
; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -410,49 +699,49 @@ define void @memcpy_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7)
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]]
; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP3]], 16
; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP3]], 32
; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP3]], 48
; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add i32 [[TMP3]], 64
; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add i32 [[TMP3]], 80
; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add i32 [[TMP3]], 96
; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add i32 [[TMP3]], 112
; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add i32 [[TMP3]], 128
; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add i32 [[TMP3]], 144
; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add i32 [[TMP3]], 160
; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add i32 [[TMP3]], 176
; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add i32 [[TMP3]], 192
; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add i32 [[TMP3]], 208
; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add i32 [[TMP3]], 224
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add i32 [[TMP3]], 240
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
@@ -477,63 +766,63 @@ define void @memcpy_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1)
; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
@@ -590,63 +879,63 @@ define void @memcpy_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspa
; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
@@ -684,112 +973,112 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i
; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]]
; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP3]], 16
; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP3]], 32
; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP3]], 48
; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add i32 [[TMP3]], 64
; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add i32 [[TMP3]], 80
; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add i32 [[TMP3]], 96
; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add i32 [[TMP3]], 112
; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add i32 [[TMP3]], 128
; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add i32 [[TMP3]], 144
; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add i32 [[TMP3]], 160
; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add i32 [[TMP3]], 176
; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add i32 [[TMP3]], 192
; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add i32 [[TMP3]], 208
; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add i32 [[TMP3]], 224
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add i32 [[TMP3]], 240
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
@@ -808,12 +1097,44 @@ define void @memcpy.inline_known_small(ptr addrspace(7) inreg %src, ptr addrspac
; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0)
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[SRC_OFF]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <4 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[SRC_OFF]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <4 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[SRC_OFF]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[DST_OFF]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[DST_OFF]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[DST_OFF]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTOFF_05:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_06:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_05]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_433:%.*]] = add i32 [[TMP2]], 4
+; CHECK-NEXT: [[DOTOFF_48:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_433]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_19:%.*]] = insertelement <4 x i32> [[DOTSLICE_06]], i32 [[DOTOFF_48]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_845:%.*]] = add i32 [[TMP2]], 8
+; CHECK-NEXT: [[DOTOFF_811:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_845]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_212:%.*]] = insertelement <4 x i32> [[DOTSLICE_19]], i32 [[DOTOFF_811]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_1257:%.*]] = add i32 [[TMP2]], 12
+; CHECK-NEXT: [[DOTOFF_1214:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_1257]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[DOTSLICE_212]], i32 [[DOTOFF_1214]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_05]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_478:%.*]] = add i32 [[TMP4]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_478]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_886:%.*]] = add i32 [[TMP4]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_811]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_886]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_1294:%.*]] = add i32 [[TMP4]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_1214]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_1294]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 false)
@@ -874,117 +1195,326 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ]
; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]]
-; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
-; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
-; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
-; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
-; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
-; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
-; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
-; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
-; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
-; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
-; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
-; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
-; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
-; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
-; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
-; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <64 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[TMP1]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <64 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[TMP1]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <64 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[TMP1]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_3:%.*]] = insertelement <64 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_16:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_4:%.*]] = insertelement <64 x i32> [[DOTSLICE_3]], i32 [[DOTOFF_16]], i64 4
+; CHECK-NEXT: [[DOTOFF_PTR_20:%.*]] = add i32 [[TMP1]], 20
+; CHECK-NEXT: [[DOTOFF_20:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_5:%.*]] = insertelement <64 x i32> [[DOTSLICE_4]], i32 [[DOTOFF_20]], i64 5
+; CHECK-NEXT: [[DOTOFF_PTR_24:%.*]] = add i32 [[TMP1]], 24
+; CHECK-NEXT: [[DOTOFF_24:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_6:%.*]] = insertelement <64 x i32> [[DOTSLICE_5]], i32 [[DOTOFF_24]], i64 6
+; CHECK-NEXT: [[DOTOFF_PTR_28:%.*]] = add i32 [[TMP1]], 28
+; CHECK-NEXT: [[DOTOFF_28:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_7:%.*]] = insertelement <64 x i32> [[DOTSLICE_6]], i32 [[DOTOFF_28]], i64 7
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_32:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_8:%.*]] = insertelement <64 x i32> [[DOTSLICE_7]], i32 [[DOTOFF_32]], i64 8
+; CHECK-NEXT: [[DOTOFF_PTR_36:%.*]] = add i32 [[TMP1]], 36
+; CHECK-NEXT: [[DOTOFF_36:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_9:%.*]] = insertelement <64 x i32> [[DOTSLICE_8]], i32 [[DOTOFF_36]], i64 9
+; CHECK-NEXT: [[DOTOFF_PTR_40:%.*]] = add i32 [[TMP1]], 40
+; CHECK-NEXT: [[DOTOFF_40:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_10:%.*]] = insertelement <64 x i32> [[DOTSLICE_9]], i32 [[DOTOFF_40]], i64 10
+; CHECK-NEXT: [[DOTOFF_PTR_44:%.*]] = add i32 [[TMP1]], 44
+; CHECK-NEXT: [[DOTOFF_44:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_11:%.*]] = insertelement <64 x i32> [[DOTSLICE_10]], i32 [[DOTOFF_44]], i64 11
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_48:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_12:%.*]] = insertelement <64 x i32> [[DOTSLICE_11]], i32 [[DOTOFF_48]], i64 12
+; CHECK-NEXT: [[DOTOFF_PTR_52:%.*]] = add i32 [[TMP1]], 52
+; CHECK-NEXT: [[DOTOFF_52:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_13:%.*]] = insertelement <64 x i32> [[DOTSLICE_12]], i32 [[DOTOFF_52]], i64 13
+; CHECK-NEXT: [[DOTOFF_PTR_56:%.*]] = add i32 [[TMP1]], 56
+; CHECK-NEXT: [[DOTOFF_56:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_14:%.*]] = insertelement <64 x i32> [[DOTSLICE_13]], i32 [[DOTOFF_56]], i64 14
+; CHECK-NEXT: [[DOTOFF_PTR_60:%.*]] = add i32 [[TMP1]], 60
+; CHECK-NEXT: [[DOTOFF_60:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_60]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_15:%.*]] = insertelement <64 x i32> [[DOTSLICE_14]], i32 [[DOTOFF_60]], i64 15
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_64:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_16:%.*]] = insertelement <64 x i32> [[DOTSLICE_15]], i32 [[DOTOFF_64]], i64 16
+; CHECK-NEXT: [[DOTOFF_PTR_68:%.*]] = add i32 [[TMP1]], 68
+; CHECK-NEXT: [[DOTOFF_68:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_68]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_17:%.*]] = insertelement <64 x i32> [[DOTSLICE_16]], i32 [[DOTOFF_68]], i64 17
+; CHECK-NEXT: [[DOTOFF_PTR_72:%.*]] = add i32 [[TMP1]], 72
+; CHECK-NEXT: [[DOTOFF_72:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_72]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_18:%.*]] = insertelement <64 x i32> [[DOTSLICE_17]], i32 [[DOTOFF_72]], i64 18
+; CHECK-NEXT: [[DOTOFF_PTR_76:%.*]] = add i32 [[TMP1]], 76
+; CHECK-NEXT: [[DOTOFF_76:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_76]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_19:%.*]] = insertelement <64 x i32> [[DOTSLICE_18]], i32 [[DOTOFF_76]], i64 19
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_80:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_20:%.*]] = insertelement <64 x i32> [[DOTSLICE_19]], i32 [[DOTOFF_80]], i64 20
+; CHECK-NEXT: [[DOTOFF_PTR_84:%.*]] = add i32 [[TMP1]], 84
+; CHECK-NEXT: [[DOTOFF_84:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_84]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_21:%.*]] = insertelement <64 x i32> [[DOTSLICE_20]], i32 [[DOTOFF_84]], i64 21
+; CHECK-NEXT: [[DOTOFF_PTR_88:%.*]] = add i32 [[TMP1]], 88
+; CHECK-NEXT: [[DOTOFF_88:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_88]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_22:%.*]] = insertelement <64 x i32> [[DOTSLICE_21]], i32 [[DOTOFF_88]], i64 22
+; CHECK-NEXT: [[DOTOFF_PTR_92:%.*]] = add i32 [[TMP1]], 92
+; CHECK-NEXT: [[DOTOFF_92:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_92]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_23:%.*]] = insertelement <64 x i32> [[DOTSLICE_22]], i32 [[DOTOFF_92]], i64 23
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_96:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_24:%.*]] = insertelement <64 x i32> [[DOTSLICE_23]], i32 [[DOTOFF_96]], i64 24
+; CHECK-NEXT: [[DOTOFF_PTR_100:%.*]] = add i32 [[TMP1]], 100
+; CHECK-NEXT: [[DOTOFF_100:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_100]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_25:%.*]] = insertelement <64 x i32> [[DOTSLICE_24]], i32 [[DOTOFF_100]], i64 25
+; CHECK-NEXT: [[DOTOFF_PTR_104:%.*]] = add i32 [[TMP1]], 104
+; CHECK-NEXT: [[DOTOFF_104:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_104]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_26:%.*]] = insertelement <64 x i32> [[DOTSLICE_25]], i32 [[DOTOFF_104]], i64 26
+; CHECK-NEXT: [[DOTOFF_PTR_108:%.*]] = add i32 [[TMP1]], 108
+; CHECK-NEXT: [[DOTOFF_108:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_108]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_27:%.*]] = insertelement <64 x i32> [[DOTSLICE_26]], i32 [[DOTOFF_108]], i64 27
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_112:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_28:%.*]] = insertelement <64 x i32> [[DOTSLICE_27]], i32 [[DOTOFF_112]], i64 28
+; CHECK-NEXT: [[DOTOFF_PTR_116:%.*]] = add i32 [[TMP1]], 116
+; CHECK-NEXT: [[DOTOFF_116:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_116]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_29:%.*]] = insertelement <64 x i32> [[DOTSLICE_28]], i32 [[DOTOFF_116]], i64 29
+; CHECK-NEXT: [[DOTOFF_PTR_120:%.*]] = add i32 [[TMP1]], 120
+; CHECK-NEXT: [[DOTOFF_120:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_120]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_30:%.*]] = insertelement <64 x i32> [[DOTSLICE_29]], i32 [[DOTOFF_120]], i64 30
+; CHECK-NEXT: [[DOTOFF_PTR_124:%.*]] = add i32 [[TMP1]], 124
+; CHECK-NEXT: [[DOTOFF_124:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_124]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_31:%.*]] = insertelement <64 x i32> [[DOTSLICE_30]], i32 [[DOTOFF_124]], i64 31
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_128:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_32:%.*]] = insertelement <64 x i32> [[DOTSLICE_31]], i32 [[DOTOFF_128]], i64 32
+; CHECK-NEXT: [[DOTOFF_PTR_132:%.*]] = add i32 [[TMP1]], 132
+; CHECK-NEXT: [[DOTOFF_132:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_132]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_33:%.*]] = insertelement <64 x i32> [[DOTSLICE_32]], i32 [[DOTOFF_132]], i64 33
+; CHECK-NEXT: [[DOTOFF_PTR_136:%.*]] = add i32 [[TMP1]], 136
+; CHECK-NEXT: [[DOTOFF_136:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_136]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_34:%.*]] = insertelement <64 x i32> [[DOTSLICE_33]], i32 [[DOTOFF_136]], i64 34
+; CHECK-NEXT: [[DOTOFF_PTR_140:%.*]] = add i32 [[TMP1]], 140
+; CHECK-NEXT: [[DOTOFF_140:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_140]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_35:%.*]] = insertelement <64 x i32> [[DOTSLICE_34]], i32 [[DOTOFF_140]], i64 35
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_144:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_36:%.*]] = insertelement <64 x i32> [[DOTSLICE_35]], i32 [[DOTOFF_144]], i64 36
+; CHECK-NEXT: [[DOTOFF_PTR_148:%.*]] = add i32 [[TMP1]], 148
+; CHECK-NEXT: [[DOTOFF_148:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_148]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_37:%.*]] = insertelement <64 x i32> [[DOTSLICE_36]], i32 [[DOTOFF_148]], i64 37
+; CHECK-NEXT: [[DOTOFF_PTR_152:%.*]] = add i32 [[TMP1]], 152
+; CHECK-NEXT: [[DOTOFF_152:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_152]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_38:%.*]] = insertelement <64 x i32> [[DOTSLICE_37]], i32 [[DOTOFF_152]], i64 38
+; CHECK-NEXT: [[DOTOFF_PTR_156:%.*]] = add i32 [[TMP1]], 156
+; CHECK-NEXT: [[DOTOFF_156:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_156]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_39:%.*]] = insertelement <64 x i32> [[DOTSLICE_38]], i32 [[DOTOFF_156]], i64 39
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_160:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_40:%.*]] = insertelement <64 x i32> [[DOTSLICE_39]], i32 [[DOTOFF_160]], i64 40
+; CHECK-NEXT: [[DOTOFF_PTR_164:%.*]] = add i32 [[TMP1]], 164
+; CHECK-NEXT: [[DOTOFF_164:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_164]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_41:%.*]] = insertelement <64 x i32> [[DOTSLICE_40]], i32 [[DOTOFF_164]], i64 41
+; CHECK-NEXT: [[DOTOFF_PTR_168:%.*]] = add i32 [[TMP1]], 168
+; CHECK-NEXT: [[DOTOFF_168:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_168]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_42:%.*]] = insertelement <64 x i32> [[DOTSLICE_41]], i32 [[DOTOFF_168]], i64 42
+; CHECK-NEXT: [[DOTOFF_PTR_172:%.*]] = add i32 [[TMP1]], 172
+; CHECK-NEXT: [[DOTOFF_172:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_172]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_43:%.*]] = insertelement <64 x i32> [[DOTSLICE_42]], i32 [[DOTOFF_172]], i64 43
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_176:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_44:%.*]] = insertelement <64 x i32> [[DOTSLICE_43]], i32 [[DOTOFF_176]], i64 44
+; CHECK-NEXT: [[DOTOFF_PTR_180:%.*]] = add i32 [[TMP1]], 180
+; CHECK-NEXT: [[DOTOFF_180:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_180]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_45:%.*]] = insertelement <64 x i32> [[DOTSLICE_44]], i32 [[DOTOFF_180]], i64 45
+; CHECK-NEXT: [[DOTOFF_PTR_184:%.*]] = add i32 [[TMP1]], 184
+; CHECK-NEXT: [[DOTOFF_184:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_184]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_46:%.*]] = insertelement <64 x i32> [[DOTSLICE_45]], i32 [[DOTOFF_184]], i64 46
+; CHECK-NEXT: [[DOTOFF_PTR_188:%.*]] = add i32 [[TMP1]], 188
+; CHECK-NEXT: [[DOTOFF_188:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_188]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_47:%.*]] = insertelement <64 x i32> [[DOTSLICE_46]], i32 [[DOTOFF_188]], i64 47
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_192:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_48:%.*]] = insertelement <64 x i32> [[DOTSLICE_47]], i32 [[DOTOFF_192]], i64 48
+; CHECK-NEXT: [[DOTOFF_PTR_196:%.*]] = add i32 [[TMP1]], 196
+; CHECK-NEXT: [[DOTOFF_196:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_196]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_49:%.*]] = insertelement <64 x i32> [[DOTSLICE_48]], i32 [[DOTOFF_196]], i64 49
+; CHECK-NEXT: [[DOTOFF_PTR_200:%.*]] = add i32 [[TMP1]], 200
+; CHECK-NEXT: [[DOTOFF_200:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_200]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_50:%.*]] = insertelement <64 x i32> [[DOTSLICE_49]], i32 [[DOTOFF_200]], i64 50
+; CHECK-NEXT: [[DOTOFF_PTR_204:%.*]] = add i32 [[TMP1]], 204
+; CHECK-NEXT: [[DOTOFF_204:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_204]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_51:%.*]] = insertelement <64 x i32> [[DOTSLICE_50]], i32 [[DOTOFF_204]], i64 51
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_208:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_52:%.*]] = insertelement <64 x i32> [[DOTSLICE_51]], i32 [[DOTOFF_208]], i64 52
+; CHECK-NEXT: [[DOTOFF_PTR_212:%.*]] = add i32 [[TMP1]], 212
+; CHECK-NEXT: [[DOTOFF_212:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_212]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_53:%.*]] = insertelement <64 x i32> [[DOTSLICE_52]], i32 [[DOTOFF_212]], i64 53
+; CHECK-NEXT: [[DOTOFF_PTR_216:%.*]] = add i32 [[TMP1]], 216
+; CHECK-NEXT: [[DOTOFF_216:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_216]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_54:%.*]] = insertelement <64 x i32> [[DOTSLICE_53]], i32 [[DOTOFF_216]], i64 54
+; CHECK-NEXT: [[DOTOFF_PTR_220:%.*]] = add i32 [[TMP1]], 220
+; CHECK-NEXT: [[DOTOFF_220:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_220]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_55:%.*]] = insertelement <64 x i32> [[DOTSLICE_54]], i32 [[DOTOFF_220]], i64 55
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_224:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_56:%.*]] = insertelement <64 x i32> [[DOTSLICE_55]], i32 [[DOTOFF_224]], i64 56
+; CHECK-NEXT: [[DOTOFF_PTR_228:%.*]] = add i32 [[TMP1]], 228
+; CHECK-NEXT: [[DOTOFF_228:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_228]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_57:%.*]] = insertelement <64 x i32> [[DOTSLICE_56]], i32 [[DOTOFF_228]], i64 57
+; CHECK-NEXT: [[DOTOFF_PTR_232:%.*]] = add i32 [[TMP1]], 232
+; CHECK-NEXT: [[DOTOFF_232:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_232]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_58:%.*]] = insertelement <64 x i32> [[DOTSLICE_57]], i32 [[DOTOFF_232]], i64 58
+; CHECK-NEXT: [[DOTOFF_PTR_236:%.*]] = add i32 [[TMP1]], 236
+; CHECK-NEXT: [[DOTOFF_236:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_236]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_59:%.*]] = insertelement <64 x i32> [[DOTSLICE_58]], i32 [[DOTOFF_236]], i64 59
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_240:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_60:%.*]] = insertelement <64 x i32> [[DOTSLICE_59]], i32 [[DOTOFF_240]], i64 60
+; CHECK-NEXT: [[DOTOFF_PTR_244:%.*]] = add i32 [[TMP1]], 244
+; CHECK-NEXT: [[DOTOFF_244:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_244]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_61:%.*]] = insertelement <64 x i32> [[DOTSLICE_60]], i32 [[DOTOFF_244]], i64 61
+; CHECK-NEXT: [[DOTOFF_PTR_248:%.*]] = add i32 [[TMP1]], 248
+; CHECK-NEXT: [[DOTOFF_248:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_248]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_62:%.*]] = insertelement <64 x i32> [[DOTSLICE_61]], i32 [[DOTOFF_248]], i64 62
+; CHECK-NEXT: [[DOTOFF_PTR_252:%.*]] = add i32 [[TMP1]], 252
+; CHECK-NEXT: [[DOTOFF_252:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_252]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <64 x i32> [[DOTSLICE_62]], i32 [[DOTOFF_252]], i64 63
; CHECK-NEXT: [[LOOP_INDEX_C1:%.*]] = trunc i64 [[LOOP_INDEX]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX_C1]]
-; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16
-; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32
-; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48
-; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64
-; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80
-; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96
-; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112
-; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128
-; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144
-; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160
-; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176
-; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192
-; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208
-; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224
-; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_1:%.*]] = add i32 [[TMP3]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_1]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_2:%.*]] = add i32 [[TMP3]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_3:%.*]] = add i32 [[TMP3]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_3]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP3]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_5:%.*]] = add i32 [[TMP3]], 20
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_5]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_6:%.*]] = add i32 [[TMP3]], 24
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_24]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_6]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_7:%.*]] = add i32 [[TMP3]], 28
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_7]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP3]], 32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_9:%.*]] = add i32 [[TMP3]], 36
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_9]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_10:%.*]] = add i32 [[TMP3]], 40
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_10]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_11:%.*]] = add i32 [[TMP3]], 44
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_11]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP3]], 48
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_13:%.*]] = add i32 [[TMP3]], 52
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_13]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_14:%.*]] = add i32 [[TMP3]], 56
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_14]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_15:%.*]] = add i32 [[TMP3]], 60
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_15]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add i32 [[TMP3]], 64
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_64]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_17:%.*]] = add i32 [[TMP3]], 68
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_68]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_17]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_18:%.*]] = add i32 [[TMP3]], 72
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_72]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_18]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_19:%.*]] = add i32 [[TMP3]], 76
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_76]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_19]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add i32 [[TMP3]], 80
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_80]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_21:%.*]] = add i32 [[TMP3]], 84
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_84]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_21]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_22:%.*]] = add i32 [[TMP3]], 88
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_88]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_22]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_23:%.*]] = add i32 [[TMP3]], 92
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_92]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_23]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add i32 [[TMP3]], 96
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_96]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_25:%.*]] = add i32 [[TMP3]], 100
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_100]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_25]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_26:%.*]] = add i32 [[TMP3]], 104
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_104]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_26]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_27:%.*]] = add i32 [[TMP3]], 108
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_108]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_27]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add i32 [[TMP3]], 112
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_112]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_29:%.*]] = add i32 [[TMP3]], 116
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_116]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_29]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_30:%.*]] = add i32 [[TMP3]], 120
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_120]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_30]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_31:%.*]] = add i32 [[TMP3]], 124
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_124]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_31]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add i32 [[TMP3]], 128
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_128]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_33:%.*]] = add i32 [[TMP3]], 132
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_132]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_33]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_34:%.*]] = add i32 [[TMP3]], 136
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_136]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_34]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_35:%.*]] = add i32 [[TMP3]], 140
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_140]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_35]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add i32 [[TMP3]], 144
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_144]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_37:%.*]] = add i32 [[TMP3]], 148
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_148]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_37]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_38:%.*]] = add i32 [[TMP3]], 152
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_152]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_38]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_39:%.*]] = add i32 [[TMP3]], 156
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_156]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_39]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add i32 [[TMP3]], 160
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_160]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_41:%.*]] = add i32 [[TMP3]], 164
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_164]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_41]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_42:%.*]] = add i32 [[TMP3]], 168
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_168]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_42]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_43:%.*]] = add i32 [[TMP3]], 172
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_172]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_43]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add i32 [[TMP3]], 176
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_176]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_45:%.*]] = add i32 [[TMP3]], 180
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_180]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_45]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_46:%.*]] = add i32 [[TMP3]], 184
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_184]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_46]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_47:%.*]] = add i32 [[TMP3]], 188
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_188]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_47]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add i32 [[TMP3]], 192
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_192]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_49:%.*]] = add i32 [[TMP3]], 196
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_196]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_49]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_50:%.*]] = add i32 [[TMP3]], 200
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_200]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_50]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_51:%.*]] = add i32 [[TMP3]], 204
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_204]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_51]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add i32 [[TMP3]], 208
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_208]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_53:%.*]] = add i32 [[TMP3]], 212
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_212]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_53]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_54:%.*]] = add i32 [[TMP3]], 216
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_216]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_54]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_55:%.*]] = add i32 [[TMP3]], 220
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_220]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_55]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add i32 [[TMP3]], 224
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_224]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_57:%.*]] = add i32 [[TMP3]], 228
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_228]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_57]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_58:%.*]] = add i32 [[TMP3]], 232
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_232]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_58]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_59:%.*]] = add i32 [[TMP3]], 236
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_236]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_59]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add i32 [[TMP3]], 240
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_61:%.*]] = add i32 [[TMP3]], 244
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_244]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_61]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_62:%.*]] = add i32 [[TMP3]], 248
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_248]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_62]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_63:%.*]] = add i32 [[TMP3]], 252
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_252]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_63]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -1002,12 +1532,44 @@ define void @memcpy.inline_known_i32_volatile(ptr addrspace(7) inreg %src, ptr a
; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648)
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[SRC_OFF]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <4 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[SRC_OFF]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <4 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[SRC_OFF]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[DST_OFF]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[DST_OFF]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[DST_OFF]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 -2147483648)
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTOFF_05:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_06:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_05]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_433:%.*]] = add i32 [[TMP2]], 4
+; CHECK-NEXT: [[DOTOFF_48:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_433]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_19:%.*]] = insertelement <4 x i32> [[DOTSLICE_06]], i32 [[DOTOFF_48]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_845:%.*]] = add i32 [[TMP2]], 8
+; CHECK-NEXT: [[DOTOFF_811:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_845]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTSLICE_212:%.*]] = insertelement <4 x i32> [[DOTSLICE_19]], i32 [[DOTOFF_811]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_1257:%.*]] = add i32 [[TMP2]], 12
+; CHECK-NEXT: [[DOTOFF_1214:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_1257]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[DOTSLICE_212]], i32 [[DOTOFF_1214]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_05]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_478:%.*]] = add i32 [[TMP4]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_478]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_886:%.*]] = add i32 [[TMP4]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_811]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_886]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DOTPART_1294:%.*]] = add i32 [[TMP4]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_1214]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_1294]], i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 true)
@@ -1028,9 +1590,25 @@ define void @memcpy.inline_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7)
; CHECK: [[LOOP_MEMCPY_EXPANSION]]:
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTOFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_0:%.*]] = insertelement <4 x i32> poison, i32 [[DOTOFF_0]], i64 0
+; CHECK-NEXT: [[DOTOFF_PTR_4:%.*]] = add i32 [[TMP4]], 4
+; CHECK-NEXT: [[DOTOFF_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_1:%.*]] = insertelement <4 x i32> [[DOTSLICE_0]], i32 [[DOTOFF_4]], i64 1
+; CHECK-NEXT: [[DOTOFF_PTR_8:%.*]] = add i32 [[TMP4]], 8
+; CHECK-NEXT: [[DOTOFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTSLICE_2:%.*]] = insertelement <4 x i32> [[DOTSLICE_1]], i32 [[DOTOFF_8]], i64 2
+; CHECK-NEXT: [[DOTOFF_PTR_12:%.*]] = add i32 [[TMP4]], 12
+; CHECK-NEXT: [[DOTOFF_12:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_12]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[DOTSLICE_2]], i32 [[DOTOFF_12]], i64 3
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0)
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP6]], 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP6]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP6]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DOTOFF_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16
; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -1067,49 +1645,49 @@ define void @memcpy.inline_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrs
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]]
; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add i32 [[TMP3]], 16
; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add i32 [[TMP3]], 32
; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add i32 [[TMP3]], 48
; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add i32 [[TMP3]], 64
; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add i32 [[TMP3]], 80
; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add i32 [[TMP3]], 96
; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add i32 [[TMP3]], 112
; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add i32 [[TMP3]], 128
; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add i32 [[TMP3]], 144
; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 36, i32 37, i32 38, i32 39>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add i32 [[TMP3]], 160
; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 40, i32 41, i32 42, i32 43>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add i32 [[TMP3]], 176
; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 44, i32 45, i32 46, i32 47>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add i32 [[TMP3]], 192
; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 48, i32 49, i32 50, i32 51>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add i32 [[TMP3]], 208
; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 52, i32 53, i32 54, i32 55>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add i32 [[TMP3]], 224
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
-; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add i32 [[TMP3]], 240
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
@@ -1134,63 +1712,63 @@ define void @memcpy.inline_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrs
; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
@@ -1247,63 +1825,63 @@ define void @memcpy.inline_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr
; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
+; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add i32 [[TMP1]], 16
; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32
+; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add i32 [[TMP1]], 32
; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48
+; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add i32 [[TMP1]], 48
; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64
+; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add i32 [[TMP1]], 64
; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80
+; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add i32 [[TMP1]], 80
; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 64, i32 65, i32 66, i32 67, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96
+; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add i32 [[TMP1]], 96
; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112
+; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add i32 [[TMP1]], 112
; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128
+; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add i32 [[TMP1]], 128
; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144
+; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add i32 [[TMP1]], 144
; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 64, i32 65, i32 66, i32 67, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160
+; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add i32 [[TMP1]], 160
; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176
+; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add i32 [[TMP1]], 176
; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 64, i32 65, i32 66, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192
+; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add i32 [[TMP1]], 192
; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208
+; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add i32 [[TMP1]], 208
; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 64, i32 65, i32 66, i32 67, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224
+; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add i32 [[TMP1]], 224
; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240
+; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add i32 [[TMP1]], 240
; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0)
; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67>
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index ef3026356f5fe..e83713eb267f5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -51,7 +51,7 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG33:![0-9]+]]
; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
-; CHECK-NEXT: [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
+; CHECK-NEXT: [[AUX_PTR_2_PTR_PART_4:%.*]] = add i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
; CHECK-NEXT: [[BUF_PTR_4_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_LEGAL]], i64 4, !dbg [[DBG33]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
; CHECK-NEXT: ret float [[RET]], !dbg [[DBG34:![0-9]+]]
More information about the llvm-commits
mailing list