[llvm] 3805355 - [AMDGPU] Handle natively unsupported types in addrspace(7) lowering (#110572)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 20 09:33:41 PST 2025
Author: Krzysztof Drewniak
Date: 2025-01-20T11:33:35-06:00
New Revision: 3805355ef69a33fc6b32e4a4de0ad3ef22584c65
URL: https://github.com/llvm/llvm-project/commit/3805355ef69a33fc6b32e4a4de0ad3ef22584c65
DIFF: https://github.com/llvm/llvm-project/commit/3805355ef69a33fc6b32e4a4de0ad3ef22584c65.diff
LOG: [AMDGPU] Handle natively unsupported types in addrspace(7) lowering (#110572)
The current lowering for ptr addrspace(7) assumed that the instruction
selector can handle arbitrary LLVM types, which is not the case. Code
generation can't deal with
- Values that aren't 8, 16, 32, 64, 96, or 128 bits long
- Aggregates (this commit only handles arrays of scalars, more may come)
- Vectors of i8 with more than one element
- 3-word values that aren't a vector of 3 32-bit values (for example, a
<6 x half>)
This commit adds a buffer contents type legalizer that adds the bitcasts,
zero-extensions, and splits into subcomponents needed to convert a load or
store operation into one that can be successfully lowered through code
generation.
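As a sketch (hand-written here, with illustrative value names; the added
tests below show the exact output), a 96-bit load such as

  %ret = load <6 x half>, ptr addrspace(7) %p

is rewritten to go through the equivalent 3 x 32-bit type, which the buffer
intrinsics do support:

  %ret.loadable = load <3 x i32>, ptr addrspace(7) %p
  %ret = bitcast <3 x i32> %ret.loadable to <6 x half>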
In the long run, some of the involved bitcasts (though potentially not
the buffer operation splitting) ought to be handled by the instruction
legalizer, but SelectionDAG makes this difficult.
It also takes advantage of the new `nuw` flag on `getelementptr` when
lowering GEPs to offset additions.
We don't currently plumb through `nsw` on GEPs since that should likely
be a separate change and would require declaring what we mean by "the
address" in the context of the GEP guarantees.
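To sketch the splitting and the `nuw` offsets together (again hand-written,
illustrative names), a store of an i256 is bitcast to <8 x i32> and split
into two 128-bit stores, the second of which addresses its slice through a
byte-offset `getelementptr nuw`:

  store i256 %v, ptr addrspace(7) %p
    ; ... becomes ...
  %v.legal = bitcast i256 %v to <8 x i32>
  %v.slice.0 = shufflevector <8 x i32> %v.legal, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i32> %v.slice.0, ptr addrspace(7) %p
  %p.part.4 = getelementptr nuw i8, ptr addrspace(7) %p, i32 16
  %v.slice.4 = shufflevector <8 x i32> %v.legal, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x i32> %v.slice.4, ptr addrspace(7) %p.part.4

Later phases of the pass then turn these split operations into the
corresponding raw buffer intrinsics.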
Added:
llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 657a406e9f7056..75a0c47f7c2773 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -66,6 +66,28 @@
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
//
+// ## Buffer contents type legalization
+//
+// The underlying buffer intrinsics only support types up to 128 bits long,
+// and don't support complex types. If buffer operations were
+// standard pointer operations that could be represented as MIR-level loads,
+// this would be handled by the various legalization schemes in instruction
+// selection. However, because we have to do the conversion from `load` and
+// `store` to intrinsics at the LLVM IR level, we must perform that legalization
+// ourselves.
+//
+// This involves a combination of
+// - Converting arrays to vectors where possible
+// - Otherwise, splitting loads and stores of aggregates into loads/stores of
+// each component.
+// - Zero-extending things to fill a whole number of bytes
+// - Casting values of types that don't neatly correspond to supported machine
+//   values (for example, an i96 or i256) into ones that would work
+//   (like <3 x i32> and <8 x i32>, respectively)
+// - Splitting values that are too long (such as aforementioned <8 x i32>) into
+// multiple operations.
+//
// ## Type remapping
//
// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
@@ -86,7 +108,6 @@
// This phase also records intrinsics so that they can be remangled or deleted
// later.
//
-//
// ## Splitting pointer structs
//
// The meat of this pass consists of defining semantics for operations that
@@ -218,6 +239,7 @@
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -551,7 +573,6 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
auto *NLI = cast<LoadInst>(LI.clone());
NLI->mutateType(IntTy);
NLI = IRB.Insert(NLI);
- copyMetadataForLoad(*NLI, LI);
NLI->takeName(&LI);
Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
@@ -576,6 +597,540 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
return true;
}
+namespace {
+/// Convert loads/stores of types that the buffer intrinsics can't handle into
+/// one or more such loads/stores that consist of legal types.
+///
+/// Do this by
+/// 1. Recursing into structs (and arrays that don't share a memory layout with
+/// vectors) since the intrinsics can't handle complex types.
+/// 2. Converting arrays of non-aggregate, byte-sized types into their
+/// corresponding vectors
+/// 3. Bitcasting unsupported types, namely overly-long scalars and byte
+/// vectors, into vectors of supported types.
+/// 4. Splitting up excessively long reads/writes into multiple operations.
+///
+/// Note that this doesn't handle complex data structures, but, in the future,
+/// the aggregate load splitter from SROA could be refactored to allow for that
+/// case.
+class LegalizeBufferContentTypesVisitor
+ : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
+ friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
+
+ IRBuilder<> IRB;
+
+ const DataLayout &DL;
+
+ /// If T is [N x U], where U is a scalar type, return the vector type
+ /// <N x U>; otherwise, return T.
+ Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
+ Value *arrayToVector(Value *V, Type *TargetType, const Twine &Name);
+ Value *vectorToArray(Value *V, Type *OrigType, const Twine &Name);
+
+
+ /// Convert a vector or scalar type that can't be operated on by buffer
+ /// intrinsics to one that would be legal through bitcasts and/or truncation.
+ /// Uses the wider of i32, i16, or i8 where possible.
+ Type *legalNonAggregateFor(Type *T);
+ Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
+ Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
+
+ struct VecSlice {
+ uint64_t Index = 0;
+ uint64_t Length = 0;
+ VecSlice() = delete;
+ };
+ /// Return the [index, length] pairs into which `T` needs to be cut to form
+ /// legal buffer load or store operations. Clears `Slices`; leaves it empty
+ /// for non-vector inputs and creates a single slice if no splitting will be
+ /// needed.
+ void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
+
+ Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
+ Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
+
+ /// In most cases, return `LegalType`. However, when given an input that would
+ /// normally be a legal type for the buffer intrinsics to return but that
+ /// isn't hooked up through SelectionDAG, return a type of the same width that
+ /// can be used with the relevant intrinsics. Specifically, handle the cases:
+ /// - <1 x T> => T for all T
+ /// - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
+ /// - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x
+ /// i32>
+ Type *intrinsicTypeFor(Type *LegalType);
+
+ /// Break up the load of a struct or array into loads of its components,
+ /// recursing as needed, and reassemble the result.
+ bool visitLoadImpl(LoadInst &OrigLI, Type *PartType,
+ SmallVectorImpl<uint32_t> &AggIdxs, uint64_t AggByteOffset,
+ Value *&Result, const Twine &Name);
+ /// Return value is (Changed, ModifiedInPlace)
+ std::pair<bool, bool> visitStoreImpl(StoreInst &OrigSI, Type *PartType,
+ SmallVectorImpl<uint32_t> &AggIdxs,
+ uint64_t AggByteOffset,
+ const Twine &Name);
+
+ bool visitInstruction(Instruction &I) { return false; }
+ bool visitLoadInst(LoadInst &LI);
+ bool visitStoreInst(StoreInst &SI);
+
+public:
+ LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
+ : IRB(Ctx), DL(DL) {}
+ bool processFunction(Function &F);
+};
+} // namespace
+
+Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
+ ArrayType *AT = dyn_cast<ArrayType>(T);
+ if (!AT)
+ return T;
+ Type *ET = AT->getElementType();
+ if (!ET->isSingleValueType() || isa<VectorType>(ET))
+ report_fatal_error("loading non-scalar arrays from buffer fat pointers "
+ "should have recursed");
+ if (!DL.typeSizeEqualsStoreSize(AT))
+ report_fatal_error(
+ "loading padded arrays from buffer fat pointers should have recursed");
+ return FixedVectorType::get(ET, AT->getNumElements());
+}
+
+Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
+ Type *TargetType,
+ const Twine &Name) {
+ Value *VectorRes = PoisonValue::get(TargetType);
+ auto *VT = cast<FixedVectorType>(TargetType);
+ unsigned EC = VT->getNumElements();
+ for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+ Value *Elem = IRB.CreateExtractValue(V, I, Name + ".elem." + Twine(I));
+ VectorRes = IRB.CreateInsertElement(VectorRes, Elem, I,
+ Name + ".as.vec." + Twine(I));
+ }
+ return VectorRes;
+}
+
+Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
+ Type *OrigType,
+ const Twine &Name) {
+ Value *ArrayRes = PoisonValue::get(OrigType);
+ ArrayType *AT = cast<ArrayType>(OrigType);
+ unsigned EC = AT->getNumElements();
+ for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+ Value *Elem = IRB.CreateExtractElement(V, I, Name + ".elem." + Twine(I));
+ ArrayRes = IRB.CreateInsertValue(ArrayRes, Elem, I,
+ Name + ".as.array." + Twine(I));
+ }
+ return ArrayRes;
+}
+
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
+ TypeSize Size = DL.getTypeStoreSizeInBits(T);
+ // Implicitly zero-extend to the next byte if needed
+ if (!DL.typeSizeEqualsStoreSize(T))
+ T = IRB.getIntNTy(Size.getFixedValue());
+ Type *ElemTy = T->getScalarType();
+ if (isa<PointerType, ScalableVectorType>(ElemTy)) {
+ // Pointers are always big enough, and we'll let scalable vectors through to
+ // fail in codegen.
+ return T;
+ }
+ unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
+ if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
+ // [vectors of] anything that's 16/32/64/128 bits can be cast and split into
+ // legal buffer operations.
+ return T;
+ }
+ Type *BestVectorElemType = nullptr;
+ if (Size.isKnownMultipleOf(32))
+ BestVectorElemType = IRB.getInt32Ty();
+ else if (Size.isKnownMultipleOf(16))
+ BestVectorElemType = IRB.getInt16Ty();
+ else
+ BestVectorElemType = IRB.getInt8Ty();
+ unsigned NumCastElems =
+ Size.getFixedValue() / BestVectorElemType->getIntegerBitWidth();
+ if (NumCastElems == 1)
+ return BestVectorElemType;
+ return FixedVectorType::get(BestVectorElemType, NumCastElems);
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
+ Value *V, Type *TargetType, const Twine &Name) {
+ Type *SourceType = V->getType();
+ TypeSize SourceSize = DL.getTypeSizeInBits(SourceType);
+ TypeSize TargetSize = DL.getTypeSizeInBits(TargetType);
+ if (SourceSize != TargetSize) {
+ Type *ShortScalarTy = IRB.getIntNTy(SourceSize.getFixedValue());
+ Type *ByteScalarTy = IRB.getIntNTy(TargetSize.getFixedValue());
+ Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar");
+ Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext");
+ V = Zext;
+ SourceType = ByteScalarTy;
+ }
+ return IRB.CreateBitCast(V, TargetType, Name + ".legal");
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
+ Value *V, Type *OrigType, const Twine &Name) {
+ Type *LegalType = V->getType();
+ TypeSize LegalSize = DL.getTypeSizeInBits(LegalType);
+ TypeSize OrigSize = DL.getTypeSizeInBits(OrigType);
+ if (LegalSize != OrigSize) {
+ Type *ShortScalarTy = IRB.getIntNTy(OrigSize.getFixedValue());
+ Type *ByteScalarTy = IRB.getIntNTy(LegalSize.getFixedValue());
+ Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast");
+ Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc");
+ return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
+ }
+ return IRB.CreateBitCast(V, OrigType, Name + ".real.ty");
+}
+
+Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
+ auto *VT = dyn_cast<FixedVectorType>(LegalType);
+ if (!VT)
+ return LegalType;
+ Type *ET = VT->getElementType();
+ // Explicitly return the element type of 1-element vectors because the
+ // underlying intrinsics don't like <1 x T> even though it's a synonym for T.
+ if (VT->getNumElements() == 1)
+ return ET;
+ if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32)
+ return FixedVectorType::get(IRB.getInt32Ty(), 3);
+ if (ET->isIntegerTy(8)) {
+ switch (VT->getNumElements()) {
+ default:
+ return LegalType; // Let it crash later
+ case 1:
+ return IRB.getInt8Ty();
+ case 2:
+ return IRB.getInt16Ty();
+ case 4:
+ return IRB.getInt32Ty();
+ case 8:
+ return FixedVectorType::get(IRB.getInt32Ty(), 2);
+ case 16:
+ return FixedVectorType::get(IRB.getInt32Ty(), 4);
+ }
+ }
+ return LegalType;
+}
+
+void LegalizeBufferContentTypesVisitor::getVecSlices(
+ Type *T, SmallVectorImpl<VecSlice> &Slices) {
+ Slices.clear();
+ auto *VT = dyn_cast<FixedVectorType>(T);
+ if (!VT)
+ return;
+
+ uint64_t ElemBitWidth =
+ DL.getTypeSizeInBits(VT->getElementType()).getFixedValue();
+
+ uint64_t ElemsPer4Words = 128 / ElemBitWidth;
+ uint64_t ElemsPer2Words = ElemsPer4Words / 2;
+ uint64_t ElemsPerWord = ElemsPer2Words / 2;
+ uint64_t ElemsPerShort = ElemsPerWord / 2;
+ uint64_t ElemsPerByte = ElemsPerShort / 2;
+ // If the elements evenly pack into 32-bit words, we can use 3-word stores,
+ // such as for <6 x bfloat> or <3 x i32>, but we can't do this for types
+ // like <3 x i64>, since a 3-word slice would cut an element in half.
+ uint64_t ElemsPer3Words = ElemsPerWord * 3;
+
+ uint64_t TotalElems = VT->getNumElements();
+ uint64_t Index = 0;
+ auto TrySlice = [&](unsigned MaybeLen) {
+ if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
+ VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen};
+ Slices.push_back(Slice);
+ Index += MaybeLen;
+ return true;
+ }
+ return false;
+ };
+ while (Index < TotalElems) {
+ TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
+ TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
+ TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
+ }
+}
+
+Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S,
+ const Twine &Name) {
+ auto *VecVT = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecVT)
+ return Vec;
+ if (S.Length == VecVT->getNumElements() && S.Index == 0)
+ return Vec;
+ if (S.Length == 1)
+ return IRB.CreateExtractElement(Vec, S.Index,
+ Name + ".slice." + Twine(S.Index));
+ SmallVector<int> Mask = llvm::to_vector(
+ llvm::iota_range<int>(S.Index, S.Index + S.Length, /*Inclusive=*/false));
+ return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Index));
+}
+
+Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
+ VecSlice S,
+ const Twine &Name) {
+ auto *WholeVT = dyn_cast<FixedVectorType>(Whole->getType());
+ if (!WholeVT)
+ return Part;
+ if (S.Length == WholeVT->getNumElements() && S.Index == 0)
+ return Part;
+ if (S.Length == 1) {
+ return IRB.CreateInsertElement(Whole, Part, S.Index,
+ Name + ".slice." + Twine(S.Index));
+ }
+ int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements();
+
+ // Extend the slice with poisons to make the main shufflevector happy.
+ SmallVector<int> ExtPartMask(NumElems, -1);
+ for (auto [I, E] : llvm::enumerate(
+ MutableArrayRef<int>(ExtPartMask).take_front(S.Length))) {
+ E = I;
+ }
+ Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask,
+ Name + ".ext." + Twine(S.Index));
+
+ SmallVector<int> Mask =
+ llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false));
+ for (auto [I, E] :
+ llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Index, S.Length)))
+ E = I + NumElems;
+ return IRB.CreateShuffleVector(Whole, ExtPart, Mask,
+ Name + ".parts." + Twine(S.Index));
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
+ LoadInst &OrigLI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
+ uint64_t AggByteOff, Value *&Result, const Twine &Name) {
+ if (auto *ST = dyn_cast<StructType>(PartType)) {
+ const StructLayout *Layout = DL.getStructLayout(ST);
+ bool Changed = false;
+ for (auto [I, ElemTy, Offset] :
+ llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
+ AggIdxs.push_back(I);
+ Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+ AggByteOff + Offset.getFixedValue(), Result,
+ Name + "." + Twine(I));
+ AggIdxs.pop_back();
+ }
+ return Changed;
+ }
+ if (auto *AT = dyn_cast<ArrayType>(PartType)) {
+ Type *ElemTy = AT->getElementType();
+ if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
+ ElemTy->isVectorTy()) {
+ TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
+ bool Changed = false;
+ for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
+ /*Inclusive=*/false)) {
+ AggIdxs.push_back(I);
+ Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+ AggByteOff + I * ElemStoreSize.getFixedValue(),
+ Result, Name + Twine(I));
+ AggIdxs.pop_back();
+ }
+ return Changed;
+ }
+ }
+
+ // Typical case
+
+ Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
+ Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+
+ SmallVector<VecSlice> Slices;
+ getVecSlices(LegalType, Slices);
+ bool HasSlices = Slices.size() > 1;
+ bool IsAggPart = !AggIdxs.empty();
+ Value *LoadsRes;
+ if (!HasSlices && !IsAggPart) {
+ Type *LoadableType = intrinsicTypeFor(LegalType);
+ if (LoadableType == PartType)
+ return false;
+
+ IRB.SetInsertPoint(&OrigLI);
+ auto *NLI = cast<LoadInst>(OrigLI.clone());
+ NLI->mutateType(LoadableType);
+ NLI = IRB.Insert(NLI);
+ NLI->setName(Name + ".loadable");
+
+ LoadsRes = IRB.CreateBitCast(NLI, LegalType, Name + ".from.loadable");
+ } else {
+ IRB.SetInsertPoint(&OrigLI);
+ LoadsRes = PoisonValue::get(LegalType);
+ Value *OrigPtr = OrigLI.getPointerOperand();
+ // If we need to split something into more than one load, its legal
+ // type will be a vector (ex. an i256 load will have LegalType = <8 x i32>).
+ // But if it's already a scalar (which can happen when splitting up a
+ // struct), the element type will be the legal type itself.
+ Type *ElemType = LegalType->getScalarType();
+ unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+ AAMDNodes AANodes = OrigLI.getAAMetadata();
+ if (IsAggPart && Slices.empty())
+ Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
+ for (VecSlice S : Slices) {
+ Type *SliceType =
+ S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+ int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
+ // You can't reasonably expect loads to wrap around the edge of memory.
+ Value *NewPtr = IRB.CreateGEP(
+ IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset),
+ OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset),
+ GEPNoWrapFlags::noUnsignedWrap());
+ Type *LoadableType = intrinsicTypeFor(SliceType);
+ LoadInst *NewLI = IRB.CreateAlignedLoad(
+ LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
+ Name + ".off." + Twine(ByteOffset));
+ copyMetadataForLoad(*NewLI, OrigLI);
+ NewLI->setAAMetadata(
+ AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
+ NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
+ NewLI->setVolatile(OrigLI.isVolatile());
+ Value *Loaded = IRB.CreateBitCast(NewLI, SliceType,
+ NewLI->getName() + ".from.loadable");
+ LoadsRes = insertSlice(LoadsRes, Loaded, S, Name);
+ }
+ }
+ if (LegalType != ArrayAsVecType)
+ LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name);
+ if (ArrayAsVecType != PartType)
+ LoadsRes = vectorToArray(LoadsRes, PartType, Name);
+
+ if (IsAggPart)
+ Result = IRB.CreateInsertValue(Result, LoadsRes, AggIdxs, Name);
+ else
+ Result = LoadsRes;
+ return true;
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) {
+ if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+
+ SmallVector<uint32_t> AggIdxs;
+ Type *OrigType = LI.getType();
+ Value *Result = PoisonValue::get(OrigType);
+ bool Changed = visitLoadImpl(LI, OrigType, AggIdxs, 0, Result, LI.getName());
+ if (!Changed)
+ return false;
+ Result->takeName(&LI);
+ LI.replaceAllUsesWith(Result);
+ LI.eraseFromParent();
+ return Changed;
+}
+
+std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
+ StoreInst &OrigSI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
+ uint64_t AggByteOff, const Twine &Name) {
+ if (auto *ST = dyn_cast<StructType>(PartType)) {
+ const StructLayout *Layout = DL.getStructLayout(ST);
+ bool Changed = false;
+ for (auto [I, ElemTy, Offset] :
+ llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
+ AggIdxs.push_back(I);
+ Changed |= std::get<0>(visitStoreImpl(OrigSI, ElemTy, AggIdxs,
+ AggByteOff + Offset.getFixedValue(),
+ Name + "." + Twine(I)));
+ AggIdxs.pop_back();
+ }
+ return std::make_pair(Changed, /*ModifiedInPlace=*/false);
+ }
+ if (auto *AT = dyn_cast<ArrayType>(PartType)) {
+ Type *ElemTy = AT->getElementType();
+ if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
+ ElemTy->isVectorTy()) {
+ TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
+ bool Changed = false;
+ for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
+ /*Inclusive=*/false)) {
+ AggIdxs.push_back(I);
+ Changed |= std::get<0>(visitStoreImpl(
+ OrigSI, ElemTy, AggIdxs,
+ AggByteOff + I * ElemStoreSize.getFixedValue(), Name + Twine(I)));
+ AggIdxs.pop_back();
+ }
+ return std::make_pair(Changed, /*ModifiedInPlace=*/false);
+ }
+ }
+
+ Value *OrigData = OrigSI.getValueOperand();
+ Value *NewData = OrigData;
+
+ bool IsAggPart = !AggIdxs.empty();
+ if (IsAggPart)
+ NewData = IRB.CreateExtractValue(NewData, AggIdxs, Name);
+
+ Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
+ if (ArrayAsVecType != PartType) {
+ NewData = arrayToVector(NewData, ArrayAsVecType, Name);
+ }
+
+ Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ if (LegalType != ArrayAsVecType) {
+ NewData = makeLegalNonAggregate(NewData, LegalType, Name);
+ }
+
+ SmallVector<VecSlice> Slices;
+ getVecSlices(LegalType, Slices);
+ bool NeedToSplit = Slices.size() > 1 || IsAggPart;
+ if (!NeedToSplit) {
+ Type *StorableType = intrinsicTypeFor(LegalType);
+ if (StorableType == PartType)
+ return std::make_pair(/*Changed=*/false, /*ModifiedInPlace=*/false);
+ NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable");
+ OrigSI.setOperand(0, NewData);
+ return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/true);
+ }
+
+ Value *OrigPtr = OrigSI.getPointerOperand();
+ Type *ElemType = LegalType->getScalarType();
+ if (IsAggPart && Slices.empty())
+ Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
+ unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+ AAMDNodes AANodes = OrigSI.getAAMetadata();
+ for (VecSlice S : Slices) {
+ Type *SliceType =
+ S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+ int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
+ Value *NewPtr =
+ IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
+ OrigPtr->getName() + ".part." + Twine(S.Index),
+ GEPNoWrapFlags::noUnsignedWrap());
+ Value *DataSlice = extractSlice(NewData, S, Name);
+ Type *StorableType = intrinsicTypeFor(SliceType);
+ DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
+ DataSlice->getName() + ".storable");
+ auto *NewSI = cast<StoreInst>(OrigSI.clone());
+ NewSI->setAlignment(commonAlignment(OrigSI.getAlign(), ByteOffset));
+ IRB.Insert(NewSI);
+ NewSI->setOperand(0, DataSlice);
+ NewSI->setOperand(1, NewPtr);
+ NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL));
+ }
+ return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/false);
+}
+
+bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
+ if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+ IRB.SetInsertPoint(&SI);
+ SmallVector<uint32_t> AggIdxs;
+ Value *OrigData = SI.getValueOperand();
+ auto [Changed, ModifiedInPlace] =
+ visitStoreImpl(SI, OrigData->getType(), AggIdxs, 0, OrigData->getName());
+ if (Changed && !ModifiedInPlace)
+ SI.eraseFromParent();
+ return Changed;
+}
+
+bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
+ bool Changed = false;
+ for (Instruction &I : make_early_inc_range(instructions(F))) {
+ Changed |= visit(I);
+ }
+ return Changed;
+}
+
/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered
/// buffer fat pointer constant.
static std::pair<Constant *, Constant *>
@@ -1766,12 +2321,16 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
}
StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+ LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
+ M.getContext());
for (Function &F : M.functions()) {
bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
bool BodyChanges = containsBufferFatPointers(F, &StructTM);
Changed |= MemOpsRewrite.processFunction(F);
- if (InterfaceChange || BodyChanges)
+ if (InterfaceChange || BodyChanges) {
NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
+ Changed |= BufferContentsTypeRewrite.processFunction(F);
+ }
}
if (NeedsRemap.empty())
return Changed;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
new file mode 100644
index 00000000000000..4c7a4ba3a44a5f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -0,0 +1,3998 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
+
+; Note: if you're adding tests here, also add them to
+; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
+; the lowering.
+
+;;; Legal types. These are natively supported; no casts should be performed.
+
+define i8 @load_i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i8, ptr addrspace(7) %p
+ ret i8 %ret
+}
+
+define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i8 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i16 @load_i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i16, ptr addrspace(7) %p
+ ret i16 %ret
+}
+
+define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i16 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i32 @load_i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i32, ptr addrspace(7) %p
+ ret i32 %ret
+}
+
+define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i32 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i64 @load_i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i64, ptr addrspace(7) %p
+ ret i64 %ret
+}
+
+define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i64 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i128 @load_i128(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i128:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i128:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i128, ptr addrspace(7) %p
+ ret i128 %ret
+}
+
+define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i128:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i128:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i128 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <1 x i32>, ptr addrspace(7) %p
+ ret <1 x i32> %ret
+}
+
+define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <1 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i32>, ptr addrspace(7) %p
+ ret <2 x i32> %ret
+}
+
+define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x i32>, ptr addrspace(7) %p
+ ret <3 x i32> %ret
+}
+
+define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i32>, ptr addrspace(7) %p
+ ret <4 x i32> %ret
+}
+
+define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i16>, ptr addrspace(7) %p
+ ret <2 x i16> %ret
+}
+
+define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i16>, ptr addrspace(7) %p
+ ret <4 x i16> %ret
+}
+
+define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i16>, ptr addrspace(7) %p
+ ret <8 x i16> %ret
+}
+
+define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i64>, ptr addrspace(7) %p
+ ret <2 x i64> %ret
+}
+
+define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i64> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define half @load_f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load half, ptr addrspace(7) %p
+ ret half %ret
+}
+
+define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store half %data, ptr addrspace(7) %p
+ ret void
+}
+
+define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load bfloat, ptr addrspace(7) %p
+ ret bfloat %ret
+}
+
+define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store bfloat %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x half>, ptr addrspace(7) %p
+ ret <2 x half> %ret
+}
+
+define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x half> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x bfloat>, ptr addrspace(7) %p
+ ret <4 x bfloat> %ret
+}
+
+define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GISEL-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x bfloat> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x half>, ptr addrspace(7) %p
+ ret <8 x half> %ret
+}
+
+define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x half> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define float @load_f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load float, ptr addrspace(7) %p
+ ret float %ret
+}
+
+define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store float %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x float>, ptr addrspace(7) %p
+ ret <2 x float> %ret
+}
+
+define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x float>, ptr addrspace(7) %p
+ ret <3 x float> %ret
+}
+
+define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x float>, ptr addrspace(7) %p
+ ret <4 x float> %ret
+}
+
+define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(0), ptr addrspace(7) %p
+ ret ptr addrspace(0) %ret
+}
+
+define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(0) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(1), ptr addrspace(7) %p
+ ret ptr addrspace(1) %ret
+}
+
+define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(1) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(2), ptr addrspace(7) %p
+ ret ptr addrspace(2) %ret
+}
+
+define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(2) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(3), ptr addrspace(7) %p
+ ret ptr addrspace(3) %ret
+}
+
+define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(3) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(4), ptr addrspace(7) %p
+ ret ptr addrspace(4) %ret
+}
+
+define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(4) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(5), ptr addrspace(7) %p
+ ret ptr addrspace(5) %ret
+}
+
+define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(5) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(6), ptr addrspace(7) %p
+ ret ptr addrspace(6) %ret
+}
+
+define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(6) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(8), ptr addrspace(7) %p
+ ret ptr addrspace(8) %ret
+}
+
+define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(8) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x ptr addrspace(1)>, ptr addrspace(7) %p
+ ret <2 x ptr addrspace(1)> %ret
+}
+
+define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x ptr addrspace(5)>, ptr addrspace(7) %p
+ ret <2 x ptr addrspace(5)> %ret
+}
+
+define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x ptr addrspace(5)>, ptr addrspace(7) %p
+ ret <3 x ptr addrspace(5)> %ret
+}
+
+define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x ptr addrspace(5)>, ptr addrspace(7) %p
+ ret <4 x ptr addrspace(5)> %ret
+}
+
+define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Types that span 3 words but have sub-dword elements. These need to be bitcast to <3 x i32> to be supported.
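+;;; As a rough sketch (illustrative only; the value names are not taken from
+;;; the pass output), the <6 x half> load below is expected to lower to:
+;;;   %legal = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(
+;;;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;;;   %ret = bitcast <3 x i32> %legal to <6 x half>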
+
+define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x half>, ptr addrspace(7) %p
+ ret <6 x half> %ret
+}
+
+define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x half> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Long types (32-bit elements). These must be split into multiple operations.
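+;;; Sketch of the expected split for the <5 x float> load below (names
+;;; illustrative, not pass output): one 4-dword operation plus a 1-dword tail.
+;;;   %lo = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(
+;;;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;;;   %hi = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(
+;;;       ptr addrspace(8) %buf, i32 16, i32 0, i32 0)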
+
+define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <5 x float>, ptr addrspace(7) %p
+ ret <5 x float> %ret
+}
+
+define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <5 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x float>, ptr addrspace(7) %p
+ ret <6 x float> %ret
+}
+
+define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <7 x float>, ptr addrspace(7) %p
+ ret <7 x float> %ret
+}
+
+define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <7 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x float>, ptr addrspace(7) %p
+ ret <8 x float> %ret
+}
+
+define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v10f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: buffer_load_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v10f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: buffer_load_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <10 x float>, ptr addrspace(7) %p
+ ret <10 x float> %ret
+}
+
+define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v10f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: buffer_store_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v10f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: buffer_store_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <10 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x i32>, ptr addrspace(7) %p
+ ret <6 x i32> %ret
+}
+
+define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x ptr addrspace(1)>, ptr addrspace(7) %p
+ ret <4 x ptr addrspace(1)> %ret
+}
+
+define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Uneven types with 16-bit elements. These require splitting into multiple operations.
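+;;; For example, the <3 x i16> load below is expected to become roughly
+;;; (sketch; names illustrative):
+;;;   %lo = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(
+;;;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;;;   %hi = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(
+;;;       ptr addrspace(8) %buf, i32 4, i32 0, i32 0)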
+
+define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <1 x i16>, ptr addrspace(7) %p
+ ret <1 x i16> %ret
+}
+
+define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <1 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x i16>, ptr addrspace(7) %p
+ ret <3 x i16> %ret
+}
+
+define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <5 x i16>, ptr addrspace(7) %p
+ ret <5 x i16> %ret
+}
+
+define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <5 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x i16>, ptr addrspace(7) %p
+ ret <6 x i16> %ret
+}
+
+define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <7 x i16>, ptr addrspace(7) %p
+ ret <7 x i16> %ret
+}
+
+define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <7 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v9i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v9i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT: v_bfi_b32 v3, s4, v3, v3
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <9 x i16>, ptr addrspace(7) %p
+ ret <9 x i16> %ret
+}
+
+define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v9i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v9i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <9 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Byte vectors. These need to be
+;;; - Split into multiple operations
+;;; - Bitcast if they have a natively supported width
+;;; as sketched below.
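+;;; Rough sketch of the two cases (illustrative only, not pass output):
+;;;   ; <4 x i8> has a natively supported width, so it is bitcast:
+;;;   %word = bitcast <4 x i8> %data to i32
+;;;   ; <3 x i8> does not, so it is split into an i16 piece (the low two
+;;;   ; bytes, stored at offset 0) and an i8 piece (stored at offset 2),
+;;;   ; matching the buffer_store_short/buffer_store_byte pairs below.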
+
+define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <1 x i8>, ptr addrspace(7) %p
+ ret <1 x i8> %ret
+}
+
+define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <1 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i8>, ptr addrspace(7) %p
+ ret <2 x i8> %ret
+}
+
+define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x i8>, ptr addrspace(7) %p
+ ret <3 x i8> %ret
+}
+
+define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i8>, ptr addrspace(7) %p
+ ret <4 x i8> %ret
+}
+
+define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v5, 8
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <5 x i8>, ptr addrspace(7) %p
+ ret <5 x i8> %ret
+}
+
+define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v6, 8
+; GISEL-NEXT: v_mov_b32_e32 v5, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v5, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <5 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v6
+; SDAG-NEXT: v_mov_b32_e32 v1, v7
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x i8>, ptr addrspace(7) %p
+ ret <6 x i8> %ret
+}
+
+define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <7 x i8>, ptr addrspace(7) %p
+ ret <7 x i8> %ret
+}
+
+define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v5
+; SDAG-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v8, 8
+; GISEL-NEXT: v_mov_b32_e32 v7, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: v_and_b32_e32 v0, 0xff, v5
+; GISEL-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <7 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v1
+; SDAG-NEXT: v_mov_b32_e32 v1, v8
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v8, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, v8
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i8>, ptr addrspace(7) %p
+ ret <8 x i8> %ret
+}
+
+define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx2 v[3:4], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v9, 8
+; GISEL-NEXT: v_mov_b32_e32 v8, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v8, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT: v_and_or_b32 v1, v4, v8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v12i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT: v_mov_b32_e32 v4, v1
+; SDAG-NEXT: v_mov_b32_e32 v1, v14
+; SDAG-NEXT: v_mov_b32_e32 v2, v13
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v12i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v12
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <12 x i8>, ptr addrspace(7) %p
+ ret <12 x i8> %ret
+}
+
+define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v12i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx3 v[6:8], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v12i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v13, 8
+; GISEL-NEXT: v_mov_b32_e32 v12, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v12, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT: v_and_or_b32 v1, v4, v12, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT: v_and_or_b32 v2, v8, v12, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <12 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v16i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b64 v[18:19], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3]
+; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v3
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3
+; SDAG-NEXT: v_mov_b32_e32 v4, v1
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v12, v3
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v16
+; SDAG-NEXT: v_mov_b32_e32 v3, v18
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v16i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v16, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v18, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GISEL-NEXT: v_mov_b32_e32 v12, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v16
+; GISEL-NEXT: v_mov_b32_e32 v2, v17
+; GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <16 x i8>, ptr addrspace(7) %p
+ ret <16 x i8> %ret
+}
+
+define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v16i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx4 v[9:12], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v16i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v17, 8
+; GISEL-NEXT: v_mov_b32_e32 v16, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v16, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT: v_and_or_b32 v1, v4, v16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT: v_and_or_b32 v2, v8, v16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT: v_and_or_b32 v3, v12, v16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT: v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <16 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[33:36], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[48:51], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[33:34]
+; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[35:36]
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b64 v[19:20], 24, v[48:49]
+; SDAG-NEXT: v_lshrrev_b64 v[27:28], 24, v[50:51]
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v33
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v33
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v34
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v34
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v34
+; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v35
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v35
+; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v36
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v36
+; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v36
+; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v48
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v48
+; SDAG-NEXT: v_lshrrev_b32_e32 v21, 8, v49
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v49
+; SDAG-NEXT: v_lshrrev_b32_e32 v23, 24, v49
+; SDAG-NEXT: v_lshrrev_b32_e32 v25, 8, v50
+; SDAG-NEXT: v_lshrrev_b32_e32 v26, 16, v50
+; SDAG-NEXT: v_lshrrev_b32_e32 v29, 8, v51
+; SDAG-NEXT: v_lshrrev_b32_e32 v30, 16, v51
+; SDAG-NEXT: v_lshrrev_b32_e32 v31, 24, v51
+; SDAG-NEXT: v_mov_b32_e32 v0, v33
+; SDAG-NEXT: v_mov_b32_e32 v4, v34
+; SDAG-NEXT: v_mov_b32_e32 v8, v35
+; SDAG-NEXT: v_mov_b32_e32 v12, v36
+; SDAG-NEXT: v_mov_b32_e32 v16, v48
+; SDAG-NEXT: v_mov_b32_e32 v20, v49
+; SDAG-NEXT: v_mov_b32_e32 v24, v50
+; SDAG-NEXT: v_mov_b32_e32 v28, v51
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[16:19], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v35, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v36, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v37, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v32, 8, v16
+; GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GISEL-NEXT: v_lshrrev_b32_e32 v34, 24, v16
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v21, 8, v17
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GISEL-NEXT: v_lshrrev_b32_e32 v23, 24, v17
+; GISEL-NEXT: v_lshrrev_b32_e32 v25, 8, v18
+; GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GISEL-NEXT: v_lshrrev_b32_e32 v27, 24, v18
+; GISEL-NEXT: v_lshrrev_b32_e32 v29, 8, v19
+; GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v19
+; GISEL-NEXT: v_lshrrev_b32_e32 v31, 24, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GISEL-NEXT: v_mov_b32_e32 v12, v3
+; GISEL-NEXT: v_mov_b32_e32 v20, v17
+; GISEL-NEXT: v_mov_b32_e32 v24, v18
+; GISEL-NEXT: v_mov_b32_e32 v28, v19
+; GISEL-NEXT: v_mov_b32_e32 v1, v35
+; GISEL-NEXT: v_mov_b32_e32 v2, v36
+; GISEL-NEXT: v_mov_b32_e32 v3, v37
+; GISEL-NEXT: v_mov_b32_e32 v17, v32
+; GISEL-NEXT: v_mov_b32_e32 v18, v33
+; GISEL-NEXT: v_mov_b32_e32 v19, v34
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <32 x i8>, ptr addrspace(7) %p
+ ret <32 x i8> %ret
+}
+
+define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_load_ubyte v14, off, s[0:3], s32
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v29
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT: v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v25
+; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v27
+; SDAG-NEXT: v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v21
+; SDAG-NEXT: v_lshlrev_b16_e32 v2, 8, v23
+; SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v17
+; SDAG-NEXT: v_lshlrev_b16_e32 v15, 8, v19
+; SDAG-NEXT: v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v4, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v14
+; SDAG-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx4 v[3:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v31, 8
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_mov_b32_e32 v32, 0xff
+; GISEL-NEXT: v_and_or_b32 v0, v0, v32, v1
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GISEL-NEXT: buffer_load_ubyte v7, off, s[0:3], s32
+; GISEL-NEXT: v_and_or_b32 v1, v4, v32, v1
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT: v_or3_b32 v1, v1, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT: v_and_or_b32 v2, v8, v32, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT: v_and_or_b32 v3, v12, v32, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT: v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_sdwa v4, v31, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v18
+; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GISEL-NEXT: v_and_or_b32 v4, v16, v32, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_or3_b32 v4, v4, v5, v6
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v22
+; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v23
+; GISEL-NEXT: v_and_or_b32 v8, v20, v32, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT: v_or3_b32 v5, v8, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_sdwa v6, v31, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v26
+; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v27
+; GISEL-NEXT: v_and_or_b32 v6, v24, v32, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GISEL-NEXT: v_or3_b32 v6, v6, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v30
+; GISEL-NEXT: v_and_or_b32 v8, v28, v32, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GISEL-NEXT: v_or3_b32 v7, v8, v9, v7
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <32 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Arrays. Need to become vectors.
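+; Rough sketch of the expected rewrite (illustrative only; value names are
+; not taken from the pass's real output). A load of [2 x i32] becomes a
+; vector buffer load whose elements are repacked into the array:
+;   %vec = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;   %elt0 = extractelement <2 x i32> %vec, i64 0
+;   %agg0 = insertvalue [2 x i32] poison, i32 %elt0, 0
+;   %elt1 = extractelement <2 x i32> %vec, i64 1
+;   %ret = insertvalue [2 x i32] %agg0, i32 %elt1, 1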
+
+define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [1 x i32], ptr addrspace(7) %p
+ ret [1 x i32] %ret
+}
+
+define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [1 x i32] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x i32], ptr addrspace(7) %p
+ ret [2 x i32] %ret
+}
+
+define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x i32] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x half], ptr addrspace(7) %p
+ ret [2 x half] %ret
+}
+
+define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0x5040100
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, s4
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x half] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x ptr addrspace(1)], ptr addrspace(7) %p
+ ret [2 x ptr addrspace(1)] %ret
+}
+
+define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x ptr addrspace(1)] %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Scalars of atypical width. Need to be cast to vectors and split.
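+; Rough sketch (illustrative names): an i40 is five bytes, so a store is
+; split at the dword boundary into a 32-bit store plus a trailing byte:
+;   %low = trunc i40 %data to i32
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %low,
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;   %shifted = lshr i40 %data, 32
+;   %high = trunc i40 %shifted to i8
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %high,
+;       ptr addrspace(8) %buf, i32 4, i32 0, i32 0)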
+
+define i40 @load_i40(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i40:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i40:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 24, v0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v4
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i40, ptr addrspace(7) %p
+ ret i40 %ret
+}
+
+define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i40:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i40:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i40 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i96 @load_i96(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i96:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i96:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i96, ptr addrspace(7) %p
+ ret i96 %ret
+}
+
+define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i96:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i96:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i96 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i160 @load_i160(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i160:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_mov_b32 s4, s33
+; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT: s_addk_i32 s32, 0x1800
+; SDAG-NEXT: s_addk_i32 s32, 0xe800
+; SDAG-NEXT: s_mov_b32 s33, s4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i160:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i160, ptr addrspace(7) %p
+ ret i160 %ret
+}
+
+define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i160:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, s33
+; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT: s_addk_i32 s32, 0x1000
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_addk_i32 s32, 0xf000
+; SDAG-NEXT: s_mov_b32 s33, s4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i160:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i160 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i256 @load_i256(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i256:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i256:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i256, ptr addrspace(7) %p
+ ret i256 %ret
+}
+
+define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i256:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i256:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i256 %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Non-byte-sized scalars. Require zero-extension.
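+; Rough sketch (illustrative names): an i7 is zero-extended to a whole
+; byte before being stored, and truncated back after being loaded:
+;   %ext = zext i7 %data to i8
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %ext,
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)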
+
+define i7 @load_i7(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i7:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i7:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i7, ptr addrspace(7) %p
+ ret i7 %ret
+}
+
+define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i7:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i7:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i7 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i4 @load_i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i4, ptr addrspace(7) %p
+ ret i4 %ret
+}
+
+define void @store_i4(i4 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i4 %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Byte-sized vectors of i4. Require casts.
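+; Rough sketch (illustrative names): a <2 x i4> occupies exactly one byte,
+; so it is bitcast to i8 and then handled like any other byte-sized value:
+;   %byte = bitcast <2 x i4> %data to i8
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %byte,
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)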
+
+define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT: v_lshrrev_b16_e32 v1, 4, v1
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i4>, ptr addrspace(7) %p
+ ret <2 x i4> %ret
+}
+
+define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i4> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[0:3], s32
+; SDAG-NEXT: buffer_load_ushort v1, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b16_e32 v4, 4, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT: v_lshrrev_b16_e32 v3, 12, v1
+; SDAG-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v4
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i4>, ptr addrspace(7) %p
+ ret <4 x i4> %ret
+}
+
+define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_mov_b32_e32 v1, 15
+; SDAG-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 12, v3
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 15
+; GISEL-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 12, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i4> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v7, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v7
+; SDAG-NEXT: v_bfe_u32 v1, v7, 4, 4
+; SDAG-NEXT: v_bfe_u32 v2, v7, 8, 4
+; SDAG-NEXT: v_bfe_u32 v3, v7, 12, 4
+; SDAG-NEXT: v_bfe_u32 v4, v7, 16, 4
+; SDAG-NEXT: v_bfe_u32 v5, v7, 20, 4
+; SDAG-NEXT: v_bfe_u32 v6, v7, 24, 4
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 28, v7
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 28, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i4>, ptr addrspace(7) %p
+ ret <8 x i4> %ret
+}
+
+define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; SDAG-NEXT: v_and_or_b32 v0, v0, 15, v1
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v3
+; SDAG-NEXT: v_and_b32_e32 v2, 15, v2
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 12, v1
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v5
+; SDAG-NEXT: v_mov_b32_e32 v2, 15
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 20, v1
+; SDAG-NEXT: v_and_b32_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT: v_or3_b32 v0, v0, v3, v1
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 28, v7
+; SDAG-NEXT: v_and_b32_sdwa v2, v6, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT: v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GISEL-NEXT: v_and_or_b32 v0, v0, 15, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 15, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 12, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_mov_b32_e32 v1, 15
+; GISEL-NEXT: v_and_b32_e32 v3, 15, v5
+; GISEL-NEXT: v_and_b32_sdwa v2, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT: v_and_b32_e32 v2, 15, v7
+; GISEL-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 28, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i4> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Vectors of non-byte-sized integers.
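+; Rough sketch (illustrative names): a <2 x i6> is 12 bits, so it is
+; bitcast to i12, zero-extended to a whole number of bytes, and stored as
+; an i16 (hence the 0xfff mask in the checks below):
+;   %bits = bitcast <2 x i6> %data to i12
+;   %ext = zext i12 %bits to i16
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %ext,
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)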
+
+define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 63, v1
+; SDAG-NEXT: v_bfe_u32 v1, v1, 6, 6
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b16_e32 v1, 6, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i6>, ptr addrspace(7) %p
+ ret <2 x i6> %ret
+}
+
+define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 6, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 63, v0
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 63, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 63, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 6, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i6> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;; Blocks of i6 elements (the in-memory form of fp6 data) that fill a whole number of 32-bit words.
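+; Rough sketch (illustrative names): <32 x i6> is 192 bits, a whole number
+; of words, so it is bitcast to <6 x i32> and then split to respect the
+; 128-bit limit on the underlying intrinsics:
+;   %words = bitcast <32 x i6> %data to <6 x i32>
+;   %lo = shufflevector <6 x i32> %words, <6 x i32> poison,
+;       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;   %hi = shufflevector <6 x i32> %words, <6 x i32> poison,
+;       <2 x i32> <i32 4, i32 5>
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %lo,
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %hi,
+;       ptr addrspace(8) %buf, i32 16, i32 0, i32 0)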
+define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <32 x i6>, ptr addrspace(7) %p
+ %ret.cast = bitcast <32 x i6> %ret to <6 x i32>
+ ret <6 x i32> %ret.cast
+}
+
+define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %data = bitcast <6 x i32> %data.abi to <32 x i6>
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <32 x i6> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Modifiers: volatile loads and stores.
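+; Volatile accesses must survive legalization: the loads below keep the
+; glc bit, and each piece of a split volatile access stays volatile. Rough
+; sketch of a volatile dword load (illustrative; the exact cache-policy
+; operand value may differ by target generation):
+;   %word = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 1) ; aux bit 0 = glc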
+
+define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load volatile <4 x i8>, ptr addrspace(7) %p
+ ret <4 x i8> %ret
+}
+
+define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v5, 8
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store volatile <4 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v6
+; SDAG-NEXT: v_mov_b32_e32 v1, v7
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load volatile <6 x i8>, ptr addrspace(7) %p
+ ret <6 x i8> %ret
+}
+
+define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store volatile <6 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
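+;;; Structs and nested aggregates. Legalized by recursing into their
+;;; elements; the homogeneous cases below collapse into single accesses.
+; Rough sketch (illustrative names): { i32 } is unpacked to its only field
+; and handled as a plain i32:
+;   %field = extractvalue { i32 } %data, 0
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %field,
+;       ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+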
+define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2a2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2a2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x [2 x i32]], ptr addrspace(7) %p
+ ret [2 x [2 x i32]] %ret
+}
+
+define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2a2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2a2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x [2 x i32]] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x <2 x i32>] @load_a2v2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2v2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x <2 x i32>], ptr addrspace(7) %p
+ ret [2 x <2 x i32>] %ret
+}
+
+define void @store_a2v2i32([2 x <2 x i32>] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2v2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x <2 x i32>] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { i32 } @load_sl_i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_i32s:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_i32s:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { i32 }, ptr addrspace(7) %p
+ ret { i32 } %ret
+}
+
+define void @store_sl_i32s({ i32 } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_i32s:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_i32s:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { i32 } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { { float } } @load_sl_sl_f32ss(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_sl_f32ss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_sl_f32ss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { { float } }, ptr addrspace(7) %p
+ ret { { float } } %ret
+}
+
+define void @store_sl_sl_f32ss({ { float } } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_sl_f32ss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_sl_f32ss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { { float } } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { <2 x i32> } @load_sl_v2i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_v2i32s:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_v2i32s:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { <2 x i32> }, ptr addrspace(7) %p
+ ret { <2 x i32> } %ret
+}
+
+define void @store_sl_v2i32s({ <2 x i32> } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_v2i32s:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_v2i32s:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { <2 x i32> } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { i64, i32 } @load_sl_i64i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_i64i32s:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_i64i32s:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { i64, i32 }, ptr addrspace(7) %p
+ ret { i64, i32 } %ret
+}
+
+define void @store_sl_i64i32s({ i64, i32 } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_i64i32s:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_i64i32s:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { i64, i32 } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [4 x i7] @load_a4i7(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a4i7:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a4i7:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT: buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [4 x i7], ptr addrspace(7) %p
+ ret [4 x i7] %ret
+}
+
+define void @store_a4i7([4 x i7] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a4i7:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7f, v1
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:1
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7f, v2
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7f, v3
+; SDAG-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:3
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a4i7:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v1
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:1
+; GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v2
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v3
+; GISEL-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:3
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [4 x i7] %data, ptr addrspace(7) %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
new file mode 100644
index 00000000000000..a91d38a58a1e9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
@@ -0,0 +1,11 @@
+; Note: The exact error messages aren't important here, but they are checked
+; so that any change in this behavior is caught.
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -filetype=null < %s 2>&1 | FileCheck %s --check-prefix=SDAG
+; SDAG: LLVM ERROR: Scalarization of scalable vectors is not supported.
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -filetype=null < %s 2>&1 | FileCheck %s --check-prefix=GISEL
+; GISEL: LLVM ERROR: Invalid size request on a scalable vector.
+
+define void @buffer_store_nxv2i32(ptr addrspace(8) inreg %rsrc, i32 %offset) {
+ call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> poison, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 6f0d51a0277380..022094bc633c88 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -91,7 +91,12 @@ define void @caller(ptr addrspace(7) noundef nonnull %arg) {
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i160 [[V_INT_RSRC]], 32
; CHECK-NEXT: [[V_INT_OFF:%.*]] = zext i32 [[V_OFF]] to i160
; CHECK-NEXT: [[V_INT:%.*]] = or i160 [[TMP1]], [[V_INT_OFF]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[V_INT]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[V_INT_LEGAL:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
+; CHECK-NEXT: [[V_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[ARG_PART_4:%.*]] = add nuw i32 [[ARG_OFF]], 16
+; CHECK-NEXT: [[V_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_LEGAL]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
%v = call ptr addrspace(7) @extern(ptr addrspace(7) %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index 5b225636b120a4..d18f0f8bd1ff93 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -1,13 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
+; Note: if you're adding tests here, also add them to
+; buffer-fat-pointers-contents-legalization.ll to make sure the output of this
+; transformation can be code-generated.
+
target triple = "amdgcn--"
;;; Legal types. These are natively supported; no casts should be performed.
-define i8 @load_i8(ptr addrspace(8) %buf) {
+define i8 @load_i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i8 @load_i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[RET:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret i8 [[RET]]
;
@@ -16,9 +20,9 @@ define i8 @load_i8(ptr addrspace(8) %buf) {
ret i8 %ret
}
-define void @store_i8(i8 %data, ptr addrspace(8) %buf) {
+define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i8(
-; CHECK-SAME: i8 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -27,9 +31,9 @@ define void @store_i8(i8 %data, ptr addrspace(8) %buf) {
ret void
}
-define i16 @load_i16(ptr addrspace(8) %buf) {
+define i16 @load_i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i16 @load_i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret i16 [[RET]]
;
@@ -38,9 +42,9 @@ define i16 @load_i16(ptr addrspace(8) %buf) {
ret i16 %ret
}
-define void @store_i16(i16 %data, ptr addrspace(8) %buf) {
+define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i16(
-; CHECK-SAME: i16 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -49,9 +53,9 @@ define void @store_i16(i16 %data, ptr addrspace(8) %buf) {
ret void
}
-define i32 @load_i32(ptr addrspace(8) %buf) {
+define i32 @load_i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i32 @load_i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret i32 [[RET]]
;
@@ -60,9 +64,9 @@ define i32 @load_i32(ptr addrspace(8) %buf) {
ret i32 %ret
}
-define void @store_i32(i32 %data, ptr addrspace(8) %buf) {
+define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i32(
-; CHECK-SAME: i32 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -71,9 +75,9 @@ define void @store_i32(i32 %data, ptr addrspace(8) %buf) {
ret void
}
-define i64 @load_i64(ptr addrspace(8) %buf) {
+define i64 @load_i64(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i64 @load_i64(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret i64 [[RET]]
;
@@ -82,9 +86,9 @@ define i64 @load_i64(ptr addrspace(8) %buf) {
ret i64 %ret
}
-define void @store_i64(i64 %data, ptr addrspace(8) %buf) {
+define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i64(
-; CHECK-SAME: i64 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -93,9 +97,9 @@ define void @store_i64(i64 %data, ptr addrspace(8) %buf) {
ret void
}
-define i128 @load_i128(ptr addrspace(8) %buf) {
+define i128 @load_i128(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i128 @load_i128(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call i128 @llvm.amdgcn.raw.ptr.buffer.load.i128(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret i128 [[RET]]
;
@@ -104,9 +108,9 @@ define i128 @load_i128(ptr addrspace(8) %buf) {
ret i128 %ret
}
-define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
+define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i128(
-; CHECK-SAME: i128 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i128 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i128(i128 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -115,10 +119,11 @@ define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
ret void
}
-define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
+define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <1 x i32> @load_v1i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <1 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
; CHECK-NEXT: ret <1 x i32> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -126,10 +131,11 @@ define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
ret <1 x i32> %ret
}
-define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v1i32(
-; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i32(<1 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -137,9 +143,9 @@ define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x i32> @load_v2i32(ptr addrspace(8) %buf) {
+define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i32> @load_v2i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <2 x i32> [[RET]]
;
@@ -148,9 +154,9 @@ define <2 x i32> @load_v2i32(ptr addrspace(8) %buf) {
ret <2 x i32> %ret
}
-define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i32(
-; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -159,9 +165,9 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) %buf) {
ret void
}
-define <3 x i32> @load_v3i32(ptr addrspace(8) %buf) {
+define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x i32> @load_v3i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <3 x i32> [[RET]]
;
@@ -170,9 +176,9 @@ define <3 x i32> @load_v3i32(ptr addrspace(8) %buf) {
ret <3 x i32> %ret
}
-define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v3i32(
-; CHECK-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -181,9 +187,9 @@ define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x i32> @load_v4i32(ptr addrspace(8) %buf) {
+define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i32> @load_v4i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <4 x i32> [[RET]]
;
@@ -192,9 +198,9 @@ define <4 x i32> @load_v4i32(ptr addrspace(8) %buf) {
ret <4 x i32> %ret
}
-define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4i32(
-; CHECK-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -203,9 +209,9 @@ define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x i16> @load_v2i16(ptr addrspace(8) %buf) {
+define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i16> @load_v2i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <2 x i16> [[RET]]
;
@@ -214,9 +220,9 @@ define <2 x i16> @load_v2i16(ptr addrspace(8) %buf) {
ret <2 x i16> %ret
}
-define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i16(
-; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -225,9 +231,9 @@ define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x i16> @load_v4i16(ptr addrspace(8) %buf) {
+define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i16> @load_v4i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <4 x i16> [[RET]]
;
@@ -236,9 +242,9 @@ define <4 x i16> @load_v4i16(ptr addrspace(8) %buf) {
ret <4 x i16> %ret
}
-define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4i16(
-; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -247,9 +253,9 @@ define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <8 x i16> @load_v8i16(ptr addrspace(8) %buf) {
+define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x i16> @load_v8i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <8 x i16> [[RET]]
;
@@ -258,9 +264,9 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) %buf) {
ret <8 x i16> %ret
}
-define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8i16(
-; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -269,9 +275,9 @@ define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x i64> @load_v2i64(ptr addrspace(8) %buf) {
+define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i64> @load_v2i64(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.amdgcn.raw.ptr.buffer.load.v2i64(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <2 x i64> [[RET]]
;
@@ -280,9 +286,9 @@ define <2 x i64> @load_v2i64(ptr addrspace(8) %buf) {
ret <2 x i64> %ret
}
-define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) %buf) {
+define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i64(
-; CHECK-SAME: <2 x i64> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i64(<2 x i64> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -291,9 +297,9 @@ define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) %buf) {
ret void
}
-define half @load_f16(ptr addrspace(8) %buf) {
+define half @load_f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define half @load_f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret half [[RET]]
;
@@ -302,9 +308,9 @@ define half @load_f16(ptr addrspace(8) %buf) {
ret half %ret
}
-define void @store_f16(half %data, ptr addrspace(8) %buf) {
+define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_f16(
-; CHECK-SAME: half [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: half [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -313,9 +319,9 @@ define void @store_f16(half %data, ptr addrspace(8) %buf) {
ret void
}
-define bfloat @load_bf16(ptr addrspace(8) %buf) {
+define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define bfloat @load_bf16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret bfloat [[RET]]
;
@@ -324,9 +330,9 @@ define bfloat @load_bf16(ptr addrspace(8) %buf) {
ret bfloat %ret
}
-define void @store_bf16(bfloat %data, ptr addrspace(8) %buf) {
+define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_bf16(
-; CHECK-SAME: bfloat [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: bfloat [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -335,9 +341,9 @@ define void @store_bf16(bfloat %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x half> @load_v2f16(ptr addrspace(8) %buf) {
+define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x half> @load_v2f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <2 x half> [[RET]]
;
@@ -346,9 +352,9 @@ define <2 x half> @load_v2f16(ptr addrspace(8) %buf) {
ret <2 x half> %ret
}
-define void @store_v2f16(<2 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2f16(
-; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -357,9 +363,9 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x bfloat> @load_v4bf16(ptr addrspace(8) %buf) {
+define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x bfloat> @load_v4bf16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <4 x bfloat> [[RET]]
;
@@ -368,9 +374,9 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) %buf) {
ret <4 x bfloat> %ret
}
-define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) %buf) {
+define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4bf16(
-; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -379,9 +385,9 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) %buf) {
ret void
}
-define <8 x half> @load_v8f16(ptr addrspace(8) %buf) {
+define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x half> @load_v8f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <8 x half> [[RET]]
;
@@ -390,9 +396,9 @@ define <8 x half> @load_v8f16(ptr addrspace(8) %buf) {
ret <8 x half> %ret
}
-define void @store_v8f16(<8 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8f16(
-; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -401,9 +407,9 @@ define void @store_v8f16(<8 x half> %data, ptr addrspace(8) %buf) {
ret void
}
-define float @load_f32(ptr addrspace(8) %buf) {
+define float @load_f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define float @load_f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret float [[RET]]
;
@@ -412,9 +418,9 @@ define float @load_f32(ptr addrspace(8) %buf) {
ret float %ret
}
-define void @store_f32(float %data, ptr addrspace(8) %buf) {
+define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_f32(
-; CHECK-SAME: float [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: float [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -423,9 +429,9 @@ define void @store_f32(float %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x float> @load_v2f32(ptr addrspace(8) %buf) {
+define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x float> @load_v2f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <2 x float> [[RET]]
;
@@ -434,9 +440,9 @@ define <2 x float> @load_v2f32(ptr addrspace(8) %buf) {
ret <2 x float> %ret
}
-define void @store_v2f32(<2 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2f32(
-; CHECK-SAME: <2 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -445,9 +451,9 @@ define void @store_v2f32(<2 x float> %data, ptr addrspace(8) %buf) {
ret void
}
-define <3 x float> @load_v3f32(ptr addrspace(8) %buf) {
+define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x float> @load_v3f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <3 x float> [[RET]]
;
@@ -456,9 +462,9 @@ define <3 x float> @load_v3f32(ptr addrspace(8) %buf) {
ret <3 x float> %ret
}
-define void @store_v3f32(<3 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v3f32(
-; CHECK-SAME: <3 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -467,9 +473,9 @@ define void @store_v3f32(<3 x float> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x float> @load_v4f32(ptr addrspace(8) %buf) {
+define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x float> @load_v4f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <4 x float> [[RET]]
;
@@ -478,9 +484,9 @@ define <4 x float> @load_v4f32(ptr addrspace(8) %buf) {
ret <4 x float> %ret
}
-define void @store_v4f32(<4 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4f32(
-; CHECK-SAME: <4 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -489,9 +495,9 @@ define void @store_v4f32(<4 x float> %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(0) @load_p0(ptr addrspace(8) %buf) {
+define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr @load_p0(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr [[RET]]
;
@@ -500,9 +506,9 @@ define ptr addrspace(0) @load_p0(ptr addrspace(8) %buf) {
ret ptr addrspace(0) %ret
}
-define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) %buf) {
+define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p0(
-; CHECK-SAME: ptr [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p0(ptr [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -511,9 +517,9 @@ define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(1) @load_p1(ptr addrspace(8) %buf) {
+define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr addrspace(1) @load_p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr addrspace(1) [[RET]]
;
@@ -522,9 +528,9 @@ define ptr addrspace(1) @load_p1(ptr addrspace(8) %buf) {
ret ptr addrspace(1) %ret
}
-define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) %buf) {
+define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p1(
-; CHECK-SAME: ptr addrspace(1) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -533,9 +539,9 @@ define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(2) @load_p2(ptr addrspace(8) %buf) {
+define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr addrspace(2) @load_p2(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(2) @llvm.amdgcn.raw.ptr.buffer.load.p2(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr addrspace(2) [[RET]]
;
@@ -544,9 +550,9 @@ define ptr addrspace(2) @load_p2(ptr addrspace(8) %buf) {
ret ptr addrspace(2) %ret
}
-define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) %buf) {
+define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p2(
-; CHECK-SAME: ptr addrspace(2) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(2) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p2(ptr addrspace(2) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -555,9 +561,9 @@ define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(3) @load_p3(ptr addrspace(8) %buf) {
+define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr addrspace(3) @load_p3(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(3) @llvm.amdgcn.raw.ptr.buffer.load.p3(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr addrspace(3) [[RET]]
;
@@ -566,9 +572,9 @@ define ptr addrspace(3) @load_p3(ptr addrspace(8) %buf) {
ret ptr addrspace(3) %ret
}
-define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) %buf) {
+define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p3(
-; CHECK-SAME: ptr addrspace(3) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(3) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p3(ptr addrspace(3) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -577,9 +583,9 @@ define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(4) @load_p4(ptr addrspace(8) %buf) {
+define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr addrspace(4) @load_p4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr addrspace(4) [[RET]]
;
@@ -588,9 +594,9 @@ define ptr addrspace(4) @load_p4(ptr addrspace(8) %buf) {
ret ptr addrspace(4) %ret
}
-define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) %buf) {
+define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p4(
-; CHECK-SAME: ptr addrspace(4) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(4) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p4(ptr addrspace(4) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -599,9 +605,9 @@ define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(5) @load_p5(ptr addrspace(8) %buf) {
+define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr addrspace(5) @load_p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(5) @llvm.amdgcn.raw.ptr.buffer.load.p5(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr addrspace(5) [[RET]]
;
@@ -610,9 +616,9 @@ define ptr addrspace(5) @load_p5(ptr addrspace(8) %buf) {
ret ptr addrspace(5) %ret
}
-define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) %buf) {
+define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p5(
-; CHECK-SAME: ptr addrspace(5) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(5) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p5(ptr addrspace(5) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -621,9 +627,9 @@ define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(6) @load_p6(ptr addrspace(8) %buf) {
+define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr addrspace(6) @load_p6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(6) @llvm.amdgcn.raw.ptr.buffer.load.p6(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr addrspace(6) [[RET]]
;
@@ -632,9 +638,9 @@ define ptr addrspace(6) @load_p6(ptr addrspace(8) %buf) {
ret ptr addrspace(6) %ret
}
-define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) %buf) {
+define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p6(
-; CHECK-SAME: ptr addrspace(6) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(6) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p6(ptr addrspace(6) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -643,9 +649,9 @@ define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) %buf) {
ret void
}
-define ptr addrspace(8) @load_p8(ptr addrspace(8) %buf) {
+define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define ptr addrspace(8) @load_p8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(8) @llvm.amdgcn.raw.ptr.buffer.load.p8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret ptr addrspace(8) [[RET]]
;
@@ -654,9 +660,9 @@ define ptr addrspace(8) @load_p8(ptr addrspace(8) %buf) {
ret ptr addrspace(8) %ret
}
-define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) %buf) {
+define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_p8(
-; CHECK-SAME: ptr addrspace(8) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p8(ptr addrspace(8) [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -665,9 +671,9 @@ define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) %buf) {
+define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x ptr addrspace(1)> @load_v2p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <2 x ptr addrspace(1)> [[RET]]
;
@@ -676,9 +682,9 @@ define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) %buf) {
ret <2 x ptr addrspace(1)> %ret
}
-define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
+define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2p1(
-; CHECK-SAME: <2 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -687,9 +693,9 @@ define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) %buf) {
+define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x ptr addrspace(5)> @load_v2p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <2 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v2p5(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <2 x ptr addrspace(5)> [[RET]]
;
@@ -698,9 +704,9 @@ define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) %buf) {
ret <2 x ptr addrspace(5)> %ret
}
-define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2p5(
-; CHECK-SAME: <2 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p5(<2 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -709,9 +715,9 @@ define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
ret void
}
-define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) %buf) {
+define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x ptr addrspace(5)> @load_v3p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <3 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v3p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <3 x ptr addrspace(5)> [[RET]]
;
@@ -720,9 +726,9 @@ define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) %buf) {
ret <3 x ptr addrspace(5)> %ret
}
-define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v3p5(
-; CHECK-SAME: <3 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3p5(<3 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -731,9 +737,9 @@ define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) %buf) {
+define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x ptr addrspace(5)> @load_v4p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[RET:%.*]] = call <4 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v4p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret <4 x ptr addrspace(5)> [[RET]]
;
@@ -742,9 +748,9 @@ define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) %buf) {
ret <4 x ptr addrspace(5)> %ret
}
-define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4p5(
-; CHECK-SAME: <4 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4p5(<4 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
@@ -755,10 +761,11 @@ define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
;;; 3 words of data in a type with sub-32-bit elements (for example, <6 x half>). These need to be bitcast to <3 x i32> to be supported.
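; A minimal sketch of this rewrite (the CHECK lines below are authoritative;
; the names here mirror those CHECK lines): a <6 x half> store is first
; bitcast to <3 x i32>, then stored with the 3-word intrinsic:
;   %data.storable = bitcast <6 x half> %data to <3 x i32>
;   call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %data.storable, ptr addrspace(8) %buf, i32 0, i32 0, i32 0)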
-define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
+define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x half> @load_v6f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x half>
; CHECK-NEXT: ret <6 x half> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -766,10 +773,11 @@ define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
ret <6 x half> %ret
}
-define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6f16(
-; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6f16(<6 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <6 x half> [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -779,10 +787,14 @@ define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
;;; Long types (32-bit elements). These must be split into multiple operations.
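; A minimal sketch of this split (the CHECK lines below are authoritative;
; the names here mirror those CHECK lines): a <5 x float> store becomes a
; <4 x float> store at offset 0 plus a scalar float store at offset 16:
;   %data.slice.0 = shufflevector <5 x float> %data, <5 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %data.slice.0, ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
;   %data.slice.4 = extractelement <5 x float> %data, i64 4
;   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data.slice.4, ptr addrspace(8) %buf, i32 16, i32 0, i32 0)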
-define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
+define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <5 x float> @load_v5f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <5 x float> @llvm.amdgcn.raw.ptr.buffer.load.v5f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x float> poison, <5 x float> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x float> [[RET_PARTS_0]], float [[RET_OFF_16]], i64 4
; CHECK-NEXT: ret <5 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -790,10 +802,13 @@ define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
ret <5 x float> %ret
}
-define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v5f32(
-; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5f32(<5 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x float> [[DATA]], <5 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x float> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -801,10 +816,15 @@ define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
ret void
}
-define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
+define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x float> @load_v6f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x float> @llvm.amdgcn.raw.ptr.buffer.load.v6f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <6 x float> poison, <6 x float> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x float> [[RET_OFF_16]], <2 x float> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <6 x float> [[RET_PARTS_0]], <6 x float> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
; CHECK-NEXT: ret <6 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -812,10 +832,13 @@ define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
ret <6 x float> %ret
}
-define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6f32(
-; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6f32(<6 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -823,10 +846,15 @@ define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
ret void
}
-define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
+define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <7 x float> @load_v7f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <7 x float> @llvm.amdgcn.raw.ptr.buffer.load.v7f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x float> poison, <7 x float> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <3 x float> [[RET_OFF_16]], <3 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <7 x float> [[RET_PARTS_0]], <7 x float> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 9>
; CHECK-NEXT: ret <7 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -834,10 +862,13 @@ define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
ret <7 x float> %ret
}
-define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v7f32(
-; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7f32(<7 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -845,10 +876,15 @@ define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
ret void
}
-define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
+define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x float> @load_v8f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x float> @llvm.amdgcn.raw.ptr.buffer.load.v8f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x float> poison, <8 x float> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_OFF_16]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <8 x float> [[RET_PARTS_0]], <8 x float> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: ret <8 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -856,10 +892,13 @@ define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
ret <8 x float> %ret
}
-define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8f32(
-; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f32(<8 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -867,10 +906,18 @@ define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
ret void
}
-define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
+define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <10 x float> @load_v10f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <10 x float> @llvm.amdgcn.raw.ptr.buffer.load.v10f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x float> poison, <10 x float> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_OFF_16]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x float> [[RET_PARTS_0]], <10 x float> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; CHECK-NEXT: [[RET_OFF_32:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x float> [[RET_OFF_32]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <10 x float> [[RET_PARTS_4]], <10 x float> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
; CHECK-NEXT: ret <10 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -878,10 +925,15 @@ define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
ret <10 x float> %ret
}
-define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v10f32(
-; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v10f32(<10 x float> [[DATA]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <2 x i32> <i32 8, i32 9>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_8]], ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -889,10 +941,15 @@ define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
ret void
}
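; (Annotation, not test content.) Worked arithmetic for the v10f32 case above:
; 40 bytes split as 16 + 16 + 8 at offsets 0, 16, and 32. With the whole
; access at align 64, the offset-0 piece keeps align 64, while the offset-16
; and offset-32 pieces get min(64, 16) = 16 and min(64, 32) = 32, matching the
; align annotations on the three intrinsic calls.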
-define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
+define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i32> @load_v6i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v6i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
; CHECK-NEXT: ret <6 x i32> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -900,10 +957,13 @@ define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
ret <6 x i32> %ret
}
-define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6i32(
-; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i32(<6 x i32> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -911,10 +971,15 @@ define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
+define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x ptr addrspace(1)> @load_v4p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v4p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_OFF_0]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <4 x ptr addrspace(1)> poison, <4 x ptr addrspace(1)> [[RET_EXT_0]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_OFF_16]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <4 x ptr addrspace(1)> [[RET_PARTS_0]], <4 x ptr addrspace(1)> [[RET_EXT_2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x ptr addrspace(1)> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -922,10 +987,13 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
ret <4 x ptr addrspace(1)> %ret
}
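; (Annotation, not test content.) Pointer-typed vectors need no integer
; bitcasts: <4 x ptr addrspace(1)> is 4 x 64 bits = 32 bytes, and the buffer
; intrinsics accept pointer element types directly, so it simply splits into
; two <2 x ptr addrspace(1)> operations at offsets 0 and 16.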
-define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
+define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4p1(
-; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4p1(<4 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_2]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -935,10 +1003,11 @@ define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
;;; Uneven types with 16-bit elements. Require splitting into multiple operations.
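; (Hand-written sketch mirroring the checks below.) The degenerate <1 x i16>
; just bitcasts to a scalar i16; a 6-byte <3 x i16>, which must touch exactly
; six bytes, instead splits into a four-byte and a two-byte piece:
;   %ret.off.0 = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
;   %ret.off.4 = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %buf, i32 4, i32 0, i32 0)
; with the scalar tail inserted at element 2 of the result.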
-define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
+define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <1 x i16> @load_v1i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <1 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v1i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <1 x i16>
; CHECK-NEXT: ret <1 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -946,10 +1015,11 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
ret <1 x i16> %ret
}
-define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v1i16(
-; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i16(<1 x i16> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <1 x i16> [[DATA]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_STORABLE]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -957,10 +1027,14 @@ define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
+define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x i16> @load_v3i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <3 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v3i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
; CHECK-NEXT: ret <3 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -968,10 +1042,13 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
ret <3 x i16> %ret
}
-define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v3i16(
-; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i16(<3 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -979,10 +1056,14 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
+define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <5 x i16> @load_v5i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <5 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v5i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i16> [[RET_OFF_0]], <4 x i16> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i16> poison, <5 x i16> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_8]], i64 4
; CHECK-NEXT: ret <5 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -990,10 +1071,13 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
ret <5 x i16> %ret
}
-define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v5i16(
-; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5i16(<5 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i16> [[DATA]], <5 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i16> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1001,10 +1085,11 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
+define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i16> @load_v6i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x i16>
; CHECK-NEXT: ret <6 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1012,10 +1097,11 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
ret <6 x i16> %ret
}
-define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6i16(
-; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <6 x i16> [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1023,10 +1109,15 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
+define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <7 x i16> @load_v7i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <7 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v7i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast <3 x i32> [[RET_OFF_0]] to <6 x i16>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <6 x i16> [[RET_OFF_0_FROM_LOADABLE]], <6 x i16> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x i16> poison, <7 x i16> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6>
+; CHECK-NEXT: [[RET_OFF_12:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <7 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_12]], i64 6
; CHECK-NEXT: ret <7 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1034,10 +1125,14 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
ret <7 x i16> %ret
}
-define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v7i16(
-; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7i16(<7 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x i16> [[DATA]], <7 x i16> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <6 x i16> [[DATA_SLICE_0]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <7 x i16> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1045,10 +1140,14 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
ret void
}
-define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
+define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <9 x i16> @load_v9i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <9 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v9i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <8 x i16> [[RET_OFF_0]], <8 x i16> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <9 x i16> poison, <9 x i16> [[RET_EXT_0]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <9 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_16]], i64 8
; CHECK-NEXT: ret <9 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1056,10 +1155,13 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
ret <9 x i16> %ret
}
-define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v9i16(
-; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v9i16(<9 x i16> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <9 x i16> [[DATA]], <9 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = extractelement <9 x i16> [[DATA]], i64 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1071,10 +1173,11 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
;;; - Split into multiple operations
;;; - Bitcast if they have a natively supported width
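; (Hand-written summary of the checks below.) Byte vectors whose total width
; is already native become a single bitcast:
;   <1 x i8> -> i8, <2 x i8> -> i16, <4 x i8> -> i32,
;   <8 x i8> -> <2 x i32>, <12 x i8> -> <3 x i32>, <16 x i8> -> <4 x i32>
; while the remaining widths split at dword/word/byte granularity, e.g.
; <7 x i8> becomes an i32, an i16, and an i8 access at offsets 0, 4, and 6.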
-define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
+define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <1 x i8> @load_v1i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <1 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v1i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <1 x i8>
; CHECK-NEXT: ret <1 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1082,10 +1185,11 @@ define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
ret <1 x i8> %ret
}
-define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v1i8(
-; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i8(<1 x i8> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <1 x i8> [[DATA]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1093,10 +1197,11 @@ define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
+define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i8> @load_v2i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v2i8(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
; CHECK-NEXT: ret <2 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1104,10 +1209,11 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
ret <2 x i8> %ret
}
-define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i8(
-; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i8(<2 x i8> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1115,10 +1221,15 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
+define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <3 x i8> @load_v3i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <3 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v3i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_0]] to <2 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i8> [[RET_OFF_0_FROM_LOADABLE]], <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i8> poison, <3 x i8> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <3 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_2]], i64 2
; CHECK-NEXT: ret <3 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1126,10 +1237,14 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
ret <3 x i8> %ret
}
-define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v3i8(
-; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i8(<3 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i8> [[DATA]], <3 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_0]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1137,10 +1252,11 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
+define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i8> @load_v4i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
; CHECK-NEXT: ret <4 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1148,10 +1264,11 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
ret <4 x i8> %ret
}
-define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4i8(
-; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1159,10 +1276,15 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
+define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <5 x i8> @load_v5i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <5 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v5i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_4]], i64 4
; CHECK-NEXT: ret <5 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1170,10 +1292,14 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
ret <5 x i8> %ret
}
-define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v5i8(
-; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5i8(<5 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1181,10 +1307,15 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
+define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i8> @load_v6i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
; CHECK-NEXT: ret <6 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1192,10 +1323,14 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
ret <6 x i8> %ret
}
-define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v6i8(
-; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1203,10 +1338,19 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
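; (Annotation, not test content.) <6 x i8> combines both rules: it is first
; bitcast to the 16-bit-element form <3 x i16>, and that vector is then split
; into a <2 x i16> piece at offset 0 and an i16 piece at offset 4.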
-define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
+define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <7 x i8> @load_v7i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <7 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v7i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x i8> poison, <7 x i8> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_4_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_4]] to <2 x i8>
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i8> [[RET_OFF_4_FROM_LOADABLE]], <2 x i8> poison, <7 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <7 x i8> [[RET_PARTS_0]], <7 x i8> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 6>
+; CHECK-NEXT: [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <7 x i8> [[RET_PARTS_4]], i8 [[RET_OFF_6]], i64 6
; CHECK-NEXT: ret <7 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1214,10 +1358,17 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
ret <7 x i8> %ret
}
-define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v7i8(
-; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7i8(<7 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[DATA_SLICE_4_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_4]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <7 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1225,10 +1376,11 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
+define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x i8> @load_v8i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v8i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <8 x i8>
; CHECK-NEXT: ret <8 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1236,10 +1388,11 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
ret <8 x i8> %ret
}
-define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8i8(
-; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i8(<8 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <8 x i8> [[DATA]] to <2 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1247,10 +1400,11 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
+define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <12 x i8> @load_v12i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <12 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v12i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <12 x i8>
; CHECK-NEXT: ret <12 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1258,10 +1412,11 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
ret <12 x i8> %ret
}
-define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v12i8(
-; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v12i8(<12 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <12 x i8> [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1269,10 +1424,11 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
+define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <16 x i8> @load_v16i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <16 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v16i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <4 x i32> [[RET_LOADABLE]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1280,10 +1436,11 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
ret <16 x i8> %ret
}
-define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v16i8(
-; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v16i8(<16 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i8> [[DATA]] to <4 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1291,10 +1448,16 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
+define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <32 x i8> @load_v32i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <32 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v32i8(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to <32 x i8>
; CHECK-NEXT: ret <32 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1302,10 +1465,14 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
ret <32 x i8> %ret
}
-define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v32i8(
-; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v32i8(<32 x i8> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i8> [[DATA]] to <8 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1315,10 +1482,13 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
;;; Arrays. Need to become vectors.
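; A hand-written sketch of the pattern checked below (not generated output):
; loads of small arrays become a single legal vector load whose lanes are
; reassembled into the aggregate, e.g. for [2 x i32]:
;   %v = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 %buf, i32 0, i32 0, i32 0)
;   %e0 = extractelement <2 x i32> %v, i64 0
;   %a0 = insertvalue [2 x i32] poison, i32 %e0, 0
;   %e1 = extractelement <2 x i32> %v, i64 1
;   %ret = insertvalue [2 x i32] %a0, i32 %e1, 1
; Stores run the same conversion in reverse (extractvalue/insertelement).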
-define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
+define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [1 x i32] @load_a1i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [1 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <1 x i32> [[RET_FROM_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [1 x i32] poison, i32 [[RET_ELEM_0]], 0
; CHECK-NEXT: ret [1 x i32] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1326,10 +1496,13 @@ define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
ret [1 x i32] %ret
}
-define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
+define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_a1i32(
-; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a1i32([1 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [1 x i32] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <1 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA_AS_VEC_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1337,10 +1510,14 @@ define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
ret void
}
-define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
+define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x i32] @load_a2i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [2 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET_ELEM_0]], 0
+; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x i32] [[RET_AS_ARRAY_0]], i32 [[RET_ELEM_1]], 1
; CHECK-NEXT: ret [2 x i32] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1348,10 +1525,14 @@ define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
ret [2 x i32] %ret
}
-define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
+define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_a2i32(
-; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2i32([2 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA]], 1
+; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA_AS_VEC_0]], i32 [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1359,10 +1540,14 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
ret void
}
-define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
+define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x half] @load_a2f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [2 x half] @llvm.amdgcn.raw.ptr.buffer.load.a2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0
+; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x half] [[RET_AS_ARRAY_0]], half [[RET_ELEM_1]], 1
; CHECK-NEXT: ret [2 x half] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1370,10 +1555,14 @@ define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
ret [2 x half] %ret
}
-define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
+define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_a2f16(
-; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2f16([2 x half] [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [2 x half] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1
+; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x half> [[DATA_AS_VEC_0]], half [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA_AS_VEC_1]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1381,10 +1570,14 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
ret void
}
-define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
+define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define [2 x ptr addrspace(1)] @load_a2p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [2 x ptr addrspace(1)] @llvm.amdgcn.raw.ptr.buffer.load.a2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x ptr addrspace(1)] poison, ptr addrspace(1) [[RET_ELEM_0]], 0
+; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x ptr addrspace(1)] [[RET_AS_ARRAY_0]], ptr addrspace(1) [[RET_ELEM_1]], 1
; CHECK-NEXT: ret [2 x ptr addrspace(1)] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1392,10 +1585,14 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
ret [2 x ptr addrspace(1)] %ret
}
-define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
+define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_a2p1(
-; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2p1([2 x ptr addrspace(1)] [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 1
+; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x ptr addrspace(1)> [[DATA_AS_VEC_0]], ptr addrspace(1) [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_AS_VEC_1]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1405,10 +1602,16 @@ define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
;;; Scalars of atypical width. Need to be cast to vectors and split.
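; Hand-written sketch of the split checked below: an i40 store is bitcast to
; <5 x i8>, the low four bytes leave as one i32 store, and the trailing byte
; as an i8 store at offset 4:
;   %v = bitcast i40 %data to <5 x i8>
;   %lo = shufflevector <5 x i8> %v, <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   %lo.i32 = bitcast <4 x i8> %lo to i32
;   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %lo.i32, ptr addrspace(8) align 8 %buf, i32 0, i32 0, i32 0)
;   %hi = extractelement <5 x i8> %v, i64 4
;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %hi, ptr addrspace(8) align 4 %buf, i32 4, i32 0, i32 0)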
-define i40 @load_i40(ptr addrspace(8) %buf) {
+define i40 @load_i40(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i40 @load_i40(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i40 @llvm.amdgcn.raw.ptr.buffer.load.i40(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT: [[RET:%.*]] = bitcast <5 x i8> [[RET_SLICE_4]] to i40
; CHECK-NEXT: ret i40 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1416,10 +1619,15 @@ define i40 @load_i40(ptr addrspace(8) %buf) {
ret i40 %ret
}
-define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
+define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i40(
-; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i40(i40 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i40 [[DATA]] to <5 x i8>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA_LEGAL]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1427,10 +1635,11 @@ define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
ret void
}
-define i96 @load_i96(ptr addrspace(8) %buf) {
+define i96 @load_i96(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i96 @load_i96(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i96 @llvm.amdgcn.raw.ptr.buffer.load.i96(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to i96
; CHECK-NEXT: ret i96 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1438,10 +1647,11 @@ define i96 @load_i96(ptr addrspace(8) %buf) {
ret i96 %ret
}
-define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
+define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i96(
-; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i96(i96 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i96 [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1449,10 +1659,15 @@ define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
ret void
}
-define i160 @load_i160(ptr addrspace(8) %buf) {
+define i160 @load_i160(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i160 @load_i160(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i160 @llvm.amdgcn.raw.ptr.buffer.load.i160(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i32> poison, <5 x i32> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <5 x i32> [[RET_PARTS_0]], i32 [[RET_OFF_16]], i64 4
+; CHECK-NEXT: [[RET:%.*]] = bitcast <5 x i32> [[RET_SLICE_4]] to i160
; CHECK-NEXT: ret i160 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1460,10 +1675,14 @@ define i160 @load_i160(ptr addrspace(8) %buf) {
ret i160 %ret
}
-define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
+define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i160(
-; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i160 [[DATA]] to <5 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i32> [[DATA_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i32> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1471,10 +1690,16 @@ define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
ret void
}
-define i256 @load_i256(ptr addrspace(8) %buf) {
+define i256 @load_i256(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define i256 @load_i256(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i256 @llvm.amdgcn.raw.ptr.buffer.load.i256(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to i256
; CHECK-NEXT: ret i256 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1482,10 +1707,14 @@ define i256 @load_i256(ptr addrspace(8) %buf) {
ret i256 %ret
}
-define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
+define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_i256(
-; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i256(i256 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i256 [[DATA]] to <8 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1495,10 +1724,11 @@ define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
;;; Non-byte-sized scalars. Require zero-extension.
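; Sketch of the conversion checked below: sub-byte scalars are rounded up to
; a whole byte, zero-extending on the store side and truncating on the load
; side:
;   store: %z = zext i7 %data to i8
;          call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %z, ptr addrspace(8) align 1 %buf, i32 0, i32 0, i32 0)
;   load:  %b = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 %buf, i32 0, i32 0, i32 0)
;          %ret = trunc i8 %b to i7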
-define i7 @load_i4(ptr addrspace(8) %buf) {
-; CHECK-LABEL: define i7 @load_i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i7 @llvm.amdgcn.raw.ptr.buffer.load.i7(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+define i7 @load_i7(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define i7 @load_i7(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i7
; CHECK-NEXT: ret i7 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1506,10 +1736,11 @@ define i7 @load_i4(ptr addrspace(8) %buf) {
ret i7 %ret
}
-define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
-; CHECK-LABEL: define void @store_i4(
-; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i7(i7 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_i7(
+; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_ZEXT:%.*]] = zext i7 [[DATA]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1517,12 +1748,37 @@ define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
ret void
}
+define i4 @load_i4(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define i4 @load_i4(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i4
+; CHECK-NEXT: ret i4 [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i4, ptr addrspace(7) %p
+ ret i4 %ret
+}
+
+define void @store_i4(i4 %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_i4(
+; CHECK-SAME: i4 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_ZEXT:%.*]] = zext i4 [[DATA]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i4 %data, ptr addrspace(7) %p
+ ret void
+}
+
;;; Byte-sized vectors of i4. Require casts.
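; Sketch of the casts checked below: an i4 vector whose total width is a
; whole number of bytes is bitcast to the integer of matching width, e.g.
;   %s = bitcast <2 x i4> %data to i8
;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %s, ptr addrspace(8) align 1 %buf, i32 0, i32 0, i32 0)
; and likewise <4 x i4> round-trips through i16 and <8 x i4> through i32.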
-define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
+define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i4> @load_v2i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v2i4(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <2 x i4>
; CHECK-NEXT: ret <2 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1530,10 +1786,11 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
ret <2 x i4> %ret
}
-define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i4(
-; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i4(<2 x i4> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i4> [[DATA]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1541,10 +1798,11 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
ret void
}
-define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
+define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i4> @load_v4i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v4i4(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <4 x i4>
; CHECK-NEXT: ret <4 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1552,10 +1810,11 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
ret <4 x i4> %ret
}
-define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v4i4(
-; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i4(<4 x i4> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i4> [[DATA]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1563,10 +1822,11 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
ret void
}
-define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
+define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <8 x i4> @load_v8i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v8i4(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <8 x i4>
; CHECK-NEXT: ret <8 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1574,10 +1834,11 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
ret <8 x i4> %ret
}
-define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v8i4(
-; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i4(<8 x i4> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <8 x i4> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1587,10 +1848,12 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
;;; Vectors of non-byte-sized integers.
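; Sketch of the pattern checked below: when the total width is not a whole
; number of bytes, the vector is first bitcast to a scalar and that scalar is
; then zero-extended, e.g. for <2 x i6> (12 bits):
;   %s = bitcast <2 x i6> %data to i12
;   %z = zext i12 %s to i16
;   call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %z, ptr addrspace(8) align 2 %buf, i32 0, i32 0, i32 0)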
-define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
+define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <2 x i6> @load_v2i6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v2i6(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_TRUNC:%.*]] = trunc i16 [[RET_LOADABLE]] to i12
+; CHECK-NEXT: [[RET:%.*]] = bitcast i12 [[RET_TRUNC]] to <2 x i6>
; CHECK-NEXT: ret <2 x i6> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1598,10 +1861,12 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
ret <2 x i6> %ret
}
-define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
+define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v2i6(
-; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i6(<2 x i6> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_AS_SCALAR:%.*]] = bitcast <2 x i6> [[DATA]] to i12
+; CHECK-NEXT: [[DATA_ZEXT:%.*]] = zext i12 [[DATA_AS_SCALAR]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_ZEXT]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1610,10 +1875,16 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
}
;; Blocks of fp6 elements, modeled as <32 x i6>
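; Sketch: a <32 x i6> block is 192 bits, so it legalizes to <6 x i32> and is
; then split into a v4i32 piece at offset 0 and a v2i32 piece at offset 16:
;   %w = bitcast <32 x i6> %data to <6 x i32>
;   %lo = shufflevector <6 x i32> %w, <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %lo, ptr addrspace(8) align 32 %buf, i32 0, i32 0, i32 0)
;   %hi = shufflevector <6 x i32> %w, <6 x i32> poison, <2 x i32> <i32 4, i32 5>
;   call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %hi, ptr addrspace(8) align 16 %buf, i32 16, i32 0, i32 0)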
-define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
+define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i32> @load_v32i6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <32 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v32i6(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT: [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT: [[RET:%.*]] = bitcast <6 x i32> [[RET_PARTS_4]] to <32 x i6>
; CHECK-NEXT: [[RET_CAST:%.*]] = bitcast <32 x i6> [[RET]] to <6 x i32>
; CHECK-NEXT: ret <6 x i32> [[RET_CAST]]
;
@@ -1623,11 +1894,15 @@ define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
ret <6 x i32> %ret.cast
}
-define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
+define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v32i6(
-; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v32i6(<32 x i6> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%data = bitcast <6 x i32> %data.abi to <32 x i6>
@@ -1638,10 +1913,11 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
;;; Modifiers
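; Note (hand-written): volatility survives legalization through the aux
; operand of the buffer intrinsics. The i32 -2147483648 in the calls below is
; bit 31 (0x80000000) set, which the AMDGPU backend interprets as the
; volatile flag; every piece of a split volatile access carries it.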
-define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
+define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <4 x i8> @volatile_load_v4i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
; CHECK-NEXT: ret <4 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1649,10 +1925,11 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
ret <4 x i8> %ret
}
-define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
+define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @volatile_store_v4i8(
-; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1660,10 +1937,15 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
ret void
}
-define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
+define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define <6 x i8> @volatile_load_v6i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
; CHECK-NEXT: ret <6 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1671,13 +1953,257 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
ret <6 x i8> %ret
}
-define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
+define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @volatile_store_v6i8(
-; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
store volatile <6 x i8> %data, ptr addrspace(7) %p
ret void
}
+
+define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [2 x [2 x i32]] @load_a2a2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET0_ELEM_0:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 0
+; CHECK-NEXT: [[RET0_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET0_ELEM_0]], 0
+; CHECK-NEXT: [[RET0_ELEM_1:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 1
+; CHECK-NEXT: [[RET0_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET0_AS_ARRAY_0]], i32 [[RET0_ELEM_1]], 1
+; CHECK-NEXT: [[RET0:%.*]] = insertvalue [2 x [2 x i32]] poison, [2 x i32] [[RET0_AS_ARRAY_1]], 0
+; CHECK-NEXT: [[RET1_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET1_ELEM_0:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 0
+; CHECK-NEXT: [[RET1_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET1_ELEM_0]], 0
+; CHECK-NEXT: [[RET1_ELEM_1:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 1
+; CHECK-NEXT: [[RET1_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET1_AS_ARRAY_0]], i32 [[RET1_ELEM_1]], 1
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x [2 x i32]] [[RET0]], [2 x i32] [[RET1_AS_ARRAY_1]], 1
+; CHECK-NEXT: ret [2 x [2 x i32]] [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x [2 x i32]], ptr addrspace(7) %p
+ ret [2 x [2 x i32]] %ret
+}
+
+define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a2a2i32(
+; CHECK-SAME: [2 x [2 x i32]] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA0:%.*]] = extractvalue [2 x [2 x i32]] [[DATA]], 0
+; CHECK-NEXT: [[DATA0_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA0]], 0
+; CHECK-NEXT: [[DATA0_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA0_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA0_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA0]], 1
+; CHECK-NEXT: [[DATA0_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA0_AS_VEC_0]], i32 [[DATA0_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA0_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = extractvalue [2 x [2 x i32]] [[DATA]], 1
+; CHECK-NEXT: [[DATA1_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA1]], 0
+; CHECK-NEXT: [[DATA1_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA1_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA1_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA1]], 1
+; CHECK-NEXT: [[DATA1_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA1_AS_VEC_0]], i32 [[DATA1_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA1_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x [2 x i32]] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x <2 x i32>] @load_a2v2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [2 x <2 x i32>] @load_a2v2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[RET0_OFF_0]], 0
+; CHECK-NEXT: [[RET1_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x <2 x i32>] [[RET0]], <2 x i32> [[RET1_OFF_8]], 1
+; CHECK-NEXT: ret [2 x <2 x i32>] [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x <2 x i32>], ptr addrspace(7) %p
+ ret [2 x <2 x i32>] %ret
+}
+
+define void @store_a2v2i32([2 x <2 x i32>] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a2v2i32(
+; CHECK-SAME: [2 x <2 x i32>] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA0:%.*]] = extractvalue [2 x <2 x i32>] [[DATA]], 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = extractvalue [2 x <2 x i32>] [[DATA]], 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA1]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x <2 x i32>] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { i32 } @load_sl_i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { i32 } @load_sl_i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_0_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertvalue { i32 } poison, i32 [[RET_0_OFF_0]], 0
+; CHECK-NEXT: ret { i32 } [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { i32 }, ptr addrspace(7) %p
+ ret { i32 } %ret
+}
+
+define void @store_sl_i32s({ i32 } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_i32s(
+; CHECK-SAME: { i32 } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_0:%.*]] = extractvalue { i32 } [[DATA]], 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { i32 } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { { float } } @load_sl_sl_f32ss(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { { float } } @load_sl_sl_f32ss(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_0_0_OFF_0:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertvalue { { float } } poison, float [[RET_0_0_OFF_0]], 0, 0
+; CHECK-NEXT: ret { { float } } [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { { float } }, ptr addrspace(7) %p
+ ret { { float } } %ret
+}
+
+define void @store_sl_sl_f32ss({ { float } } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_sl_f32ss(
+; CHECK-SAME: { { float } } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_0_0:%.*]] = extractvalue { { float } } [[DATA]], 0, 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA_0_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { { float } } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { <2 x i32> } @load_sl_v2i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { <2 x i32> } @load_sl_v2i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertvalue { <2 x i32> } poison, <2 x i32> [[RET_0_OFF_0]], 0
+; CHECK-NEXT: ret { <2 x i32> } [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { <2 x i32> }, ptr addrspace(7) %p
+ ret { <2 x i32> } %ret
+}
+
+define void @store_sl_v2i32s({ <2 x i32> } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_v2i32s(
+; CHECK-SAME: { <2 x i32> } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_0:%.*]] = extractvalue { <2 x i32> } [[DATA]], 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { <2 x i32> } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define { i64, i32 } @load_sl_i64i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { i64, i32 } @load_sl_i64i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET_0_OFF_0:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_0:%.*]] = insertvalue { i64, i32 } poison, i64 [[RET_0_OFF_0]], 0
+; CHECK-NEXT: [[RET_1_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertvalue { i64, i32 } [[RET_0]], i32 [[RET_1_OFF_8]], 1
+; CHECK-NEXT: ret { i64, i32 } [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load { i64, i32 }, ptr addrspace(7) %p
+ ret { i64, i32 } %ret
+}
+
+define void @store_sl_i64i32s({ i64, i32 } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_i64i32s(
+; CHECK-SAME: { i64, i32 } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA_0:%.*]] = extractvalue { i64, i32 } [[DATA]], 0
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_1:%.*]] = extractvalue { i64, i32 } [[DATA]], 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_1]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store { i64, i32 } %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [4 x i7] @load_a4i7(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [4 x i7] @load_a4i7(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET0_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET0_TRUNC:%.*]] = trunc i8 [[RET0_OFF_0]] to i7
+; CHECK-NEXT: [[RET0:%.*]] = insertvalue [4 x i7] poison, i7 [[RET0_TRUNC]], 0
+; CHECK-NEXT: [[RET1_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[RET1_TRUNC:%.*]] = trunc i8 [[RET1_OFF_1]] to i7
+; CHECK-NEXT: [[RET1:%.*]] = insertvalue [4 x i7] [[RET0]], i7 [[RET1_TRUNC]], 1
+; CHECK-NEXT: [[RET2_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET2_TRUNC:%.*]] = trunc i8 [[RET2_OFF_2]] to i7
+; CHECK-NEXT: [[RET2:%.*]] = insertvalue [4 x i7] [[RET1]], i7 [[RET2_TRUNC]], 2
+; CHECK-NEXT: [[RET3_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: [[RET3_TRUNC:%.*]] = trunc i8 [[RET3_OFF_3]] to i7
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [4 x i7] [[RET2]], i7 [[RET3_TRUNC]], 3
+; CHECK-NEXT: ret [4 x i7] [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [4 x i7], ptr addrspace(7) %p
+ ret [4 x i7] %ret
+}
+
+define void @store_a4i7([4 x i7] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a4i7(
+; CHECK-SAME: [4 x i7] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[DATA0:%.*]] = extractvalue [4 x i7] [[DATA]], 0
+; CHECK-NEXT: [[DATA0_ZEXT:%.*]] = zext i7 [[DATA0]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA0_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = extractvalue [4 x i7] [[DATA]], 1
+; CHECK-NEXT: [[DATA1_ZEXT:%.*]] = zext i7 [[DATA1]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA1_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT: [[DATA2:%.*]] = extractvalue [4 x i7] [[DATA]], 2
+; CHECK-NEXT: [[DATA2_ZEXT:%.*]] = zext i7 [[DATA2]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA2_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[DATA3:%.*]] = extractvalue [4 x i7] [[DATA]], 3
+; CHECK-NEXT: [[DATA3_ZEXT:%.*]] = zext i7 [[DATA3]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA3_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [4 x i7] %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Scalable vector. This isn't semantically meaningful but shouldn't crash.
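+; Note (hand-written): scalable vectors have no fixed bit width, so the
+; contents legalizer makes no attempt to bitcast or split them; the intrinsic
+; calls below keep their nxv2i32 type, and rejecting such operations is left
+; to later stages of code generation.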
+
+define <vscale x 2 x i32> @load_nxv2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <vscale x 2 x i32> @load_nxv2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[RET:%.*]] = call <vscale x 2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.nxv2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret <vscale x 2 x i32> [[RET]]
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <vscale x 2 x i32>, ptr addrspace(7) %p
+ ret <vscale x 2 x i32> %ret
+}
+
+define void @store_nxv2i32(<vscale x 2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_nxv2i32(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <vscale x 2 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index 90fc3cf3d72ea3..c3762e2cfff328 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -54,7 +54,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i160 [[BUF_PTR_4_PTR_INT_RSRC]], 32, !dbg [[DBG33]]
; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_OFF:%.*]] = zext i32 [[BUF_PTR_4_PTR_OFF]] to i160, !dbg [[DBG33]]
; CHECK-NEXT: [[BUF_PTR_4_PTR_INT:%.*]] = or i160 [[TMP10]], [[BUF_PTR_4_PTR_INT_OFF]], !dbg [[DBG33]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[BUF_PTR_4_PTR_INT]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT: [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], i64 4, !dbg [[DBG33]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
; CHECK-NEXT: ret float [[RET]], !dbg [[DBG34:![0-9]+]]
;
%buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20