[llvm] [AMDGPU] Handle natively unsupported types in addrspace(7) lowering (PR #110572)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 30 14:12:12 PDT 2024
https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/110572
The current lowering for ptr addrspace(7) assumed that the instruction selector can handle arbitrary LLVM types, which is not the case. Code generation can't deal with:
- Values that aren't 8, 16, 32, 64, 96, or 128 bits long
- Aggregates (this commit only handles arrays of scalars, more may come)
- Vectors of more than one byte-sized value (for example, <2 x i8>)
- 3-word values that aren't a vector of 3 32-bit values (for example, a <6 x half>)
This commit adds a buffer contents type legalizer that inserts the bitcasts, zero-extensions, and splits into subcomponents needed to convert a load or store operation into one that can be successfully lowered through code generation.
In the long run, some of the involved bitcasts (though potentially not the buffer operation splitting) ought to be handled by the instruction legalizer, but SelectionDAG makes this difficult.
It also takes advantage of the new `nuw` flag on `getelementptr` when lowering GEPs to offset additions.
We don't currently plumb through `nsw` on GEPs since that should likely be a separate change and would require declaring what we mean by "the address" in the context of the GEP guarantees.
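
As an illustrative sketch of the legalizer's output (the value names here are schematic rather than the exact names the pass emits), a load of `<6 x half>` (96 bits, but not `<3 x i32>`) is rewritten so that only a natively supported type reaches instruction selection:

    %ret.loadable = load <3 x i32>, ptr addrspace(7) %p
    %ret = bitcast <3 x i32> %ret.loadable to <6 x half>

Oversized values take the splitting path instead: an `i256`, for example, is bitcast to `<8 x i32>` and loaded as two `<4 x i32>` pieces (the second through a `getelementptr nuw i8` at byte offset 16) that are recombined with `shufflevector`s and cast back to `i256`.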
From 068913000593b831f2e94c26d63b517069a1a8d9 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 27 Sep 2024 19:18:46 +0000
Subject: [PATCH] [AMDGPU] Handle natively unsupported types in addrspace(7)
lowering
The current lowering for ptr addrspace(7) assumed that the instruction
selector can handle arbitrary LLVM types, which is not the case. Code
generation can't deal with:
- Values that aren't 8, 16, 32, 64, 96, or 128 bits long
- Aggregates (this commit only handles arrays of scalars, more may come)
- Vectors of more than one byte-sized value (for example, <2 x i8>)
- 3-word values that aren't a vector of 3 32-bit values (for example, a
<6 x half>)
This commit adds a buffer contents type legalizer that inserts the
bitcasts, zero-extensions, and splits into subcomponents needed to convert a
load or store operation into one that can be successfully lowered through
code generation.
In the long run, some of the involved bitcasts (though potentially not
the buffer operation splitting) ought to be handled by the instruction
legalizer, but SelectionDAG makes this difficult.
It also takes advantage of the new `nuw` flag on `getelementptr` when
lowering GEPs to offset additions.
We don't currently plumb through `nsw` on GEPs since that should likely
be a separate change and would require declaring what we mean by
"the address" in the context of the GEP guarantees.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 56 +-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 12 +-
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 463 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +-
llvm/lib/Transforms/Utils/Local.cpp | 12 +
...ffer-fat-pointers-contents-legalization.ll | 4871 +++++++++++++++++
.../AMDGPU/lower-buffer-fat-pointers-calls.ll | 9 +-
...ffer-fat-pointers-contents-legalization.ll | 430 +-
...fer-fat-pointers-unoptimized-debug-data.ll | 7 +-
9 files changed, 5743 insertions(+), 119 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 271c8d45fd4a21..1da029444027e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5794,8 +5794,9 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
return Reg;
}
-Register AMDGPULegalizerInfo::fixStoreSourceType(
- MachineIRBuilder &B, Register VData, bool IsFormat) const {
+Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
+ Register VData, LLT MemTy,
+ bool IsFormat) const {
MachineRegisterInfo *MRI = B.getMRI();
LLT Ty = MRI->getType(VData);
@@ -5805,6 +5806,10 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(
if (hasBufferRsrcWorkaround(Ty))
return castBufferRsrcToV4I32(VData, B);
+ if (shouldBitcastLoadStoreType(ST, Ty, MemTy) || Ty.isPointerVector()) {
+ Ty = getBitcastRegisterType(Ty);
+ VData = B.buildBitcast(Ty, VData).getReg(0);
+ }
// Fixup illegal register types for i8 stores.
if (Ty == LLT::scalar(8) || Ty == S16) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
@@ -5822,22 +5827,26 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(
}
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
+ LegalizerHelper &Helper,
bool IsTyped,
bool IsFormat) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+
Register VData = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(VData);
LLT EltTy = Ty.getScalarType();
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
const LLT S32 = LLT::scalar(32);
- VData = fixStoreSourceType(B, VData, IsFormat);
- castBufferRsrcArgToV4I32(MI, B, 2);
- Register RSrc = MI.getOperand(2).getReg();
-
MachineMemOperand *MMO = *MI.memoperands_begin();
const int MemSize = MMO->getSize().getValue();
+ LLT MemTy = MMO->getMemoryType();
+
+ VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
+
+ castBufferRsrcArgToV4I32(MI, B, 2);
+ Register RSrc = MI.getOperand(2).getReg();
unsigned ImmOffset;
@@ -5930,10 +5939,13 @@ static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
}
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
+ LegalizerHelper &Helper,
bool IsFormat,
bool IsTyped) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
+
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
MachineMemOperand *MMO = *MI.memoperands_begin();
const LLT MemTy = MMO->getMemoryType();
@@ -5982,9 +5994,21 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
// Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
// logic doesn't have to handle that case.
if (hasBufferRsrcWorkaround(Ty)) {
+ Observer.changingInstr(MI);
Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
+ Observer.changedInstr(MI);
Dst = MI.getOperand(0).getReg();
+ B.setInsertPt(B.getMBB(), MI);
}
+ if (shouldBitcastLoadStoreType(ST, Ty, MemTy) || Ty.isPointerVector()) {
+ Ty = getBitcastRegisterType(Ty);
+ Observer.changingInstr(MI);
+ Helper.bitcastDst(MI, Ty, 0);
+ Observer.changedInstr(MI);
+ Dst = MI.getOperand(0).getReg();
+ B.setInsertPt(B.getMBB(), MI);
+ }
+
LLT EltTy = Ty.getScalarType();
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
const bool Unpacked = ST.hasUnpackedD16VMem();
@@ -7364,17 +7388,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_ptr_buffer_store:
- return legalizeBufferStore(MI, MRI, B, false, false);
+ return legalizeBufferStore(MI, Helper, false, false);
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
- return legalizeBufferStore(MI, MRI, B, false, true);
+ return legalizeBufferStore(MI, Helper, false, true);
case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
- return legalizeBufferStore(MI, MRI, B, true, true);
+ return legalizeBufferStore(MI, Helper, true, true);
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_atomic_buffer_load:
@@ -7383,17 +7407,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_atomic_buffer_load:
case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
- return legalizeBufferLoad(MI, MRI, B, false, false);
+ return legalizeBufferLoad(MI, Helper, false, false);
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
- return legalizeBufferLoad(MI, MRI, B, true, false);
+ return legalizeBufferLoad(MI, Helper, true, false);
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
- return legalizeBufferLoad(MI, MRI, B, true, true);
+ return legalizeBufferLoad(MI, Helper, true, true);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 84470dc75b60ef..86c15197805d23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -195,15 +195,13 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg, bool ImageStore = false) const;
- Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
+ Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy,
bool IsFormat) const;
- bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool IsTyped,
- bool IsFormat) const;
- bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool IsFormat,
- bool IsTyped) const;
+ bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper,
+ bool IsTyped, bool IsFormat) const;
+ bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper,
+ bool IsFormat, bool IsTyped) const;
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 787747e6055805..831474c192526f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -86,6 +86,25 @@
// This phase also records intrinsics so that they can be remangled or deleted
// later.
//
+// ## Buffer contents type legalization
+//
+// The underlying buffer intrinsics only support types up to 128 bits long,
+// and don't support complex types. If buffer operations were
+// standard pointer operations that could be represented as MIR-level loads,
+// this would be handled by the various legalization schemes in instruction
+// selection. However, because we have to do the conversion from `load` and
+// `store` to intrinsics at LLVM IR level, we must perform that legalization
+// ourselves.
+//
+// This involves a combination of
+// - Converting arrays to vectors where possible
+// - Zero-extending things to fill a whole number of bytes
+// - Casting values of types that don't neatly correspond to supported
+//   machine values (for example, an i96 or i256) into ones that would work
+//   (like <3 x i32> and <8 x i32>, respectively)
+// - Splitting values that are too long (such as the aforementioned
+//   <8 x i32>) into multiple operations.
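+//
+// As an illustrative sketch (schematic value names, not the pass's exact
+// output), a `store i6 %v, ptr addrspace(7) %p` becomes
+//   %v.zext = zext i6 %v to i8
+//   store i8 %v.zext, ptr addrspace(7) %p
+// and a `store i96` becomes a store of the value bitcast to <3 x i32>.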
//
// ## Splitting pointer structs
//
@@ -218,6 +237,7 @@
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -551,7 +571,6 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
auto *NLI = cast<LoadInst>(LI.clone());
NLI->mutateType(IntTy);
NLI = IRB.Insert(NLI);
- copyMetadataForLoad(*NLI, LI);
NLI->takeName(&LI);
Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
@@ -576,6 +595,434 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
return true;
}
+namespace {
+/// Convert loads/stores of types that the buffer intrinsics can't handle into
+/// one or more such loads/stores that consist of legal types.
+///
+/// Do this by
+/// 1. Converting arrays of non-aggregate, byte-sized types into their
+///    corresponding vectors
+/// 2. Bitcasting unsupported types, namely overly-long scalars and byte
+/// vectors, into vectors of supported types.
+/// 3. Splitting up excessively long reads/writes into multiple operations.
+///
+/// Note that this doesn't handle complex data structures, but, in the future,
+/// the aggregate load splitter from SROA could be refactored to allow for that
+/// case.
+class LegalizeBufferContentTypesVisitor
+ : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
+ friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
+
+ IRBuilder<> IRB;
+
+ const DataLayout &DL;
+
+ /// If T is [N x U], where U is a scalar type, return the vector type
+ /// <N x U>, otherwise, return T.
+ Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
+ Value *arrayToVector(Value *V, Type *TargetType, StringRef Name);
+ Value *vectorToArray(Value *V, Type *OrigType, StringRef Name);
+
+ /// Convert a vector or scalar type that can't be operated on by buffer
+ /// intrinsics to one that would be legal through bitcasts and/or truncation.
+ /// Uses the wider of i32, i16, or i8 where possible.
+ Type *legalNonAggregateFor(Type *T);
+ Value *makeLegalNonAggregate(Value *V, Type *TargetType, StringRef Name);
+ Value *makeIllegalNonAggregate(Value *V, Type *OrigType, StringRef Name);
+
+ struct Slice {
+ unsigned Offset;
+ unsigned Length;
+ Slice(unsigned Offset, unsigned Length) : Offset(Offset), Length(Length) {}
+ };
+ // Return the [offset, length] pairs into which `T` needs to be cut to form
+ // legal buffer load or store operations. Clears `Slices`. Creates an empty
+ // `Slices` for non-vector inputs and creates one slice if no slicing will be
+ // needed.
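+  // For example (illustrative): a <6 x i32> input is cut into the slices
+  // {Offset=0, Length=4} and {Offset=4, Length=2}, i.e., a dwordx4 piece
+  // followed by a dwordx2 piece.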
+ void getSlices(Type *T, SmallVectorImpl<Slice> &Slices);
+
+ Value *extractSlice(Value *Vec, Slice S, StringRef Name);
+ Value *insertSlice(Value *Whole, Value *Part, Slice S, StringRef Name);
+
+ // In most cases, return `LegalType`. However, when given an input that would
+ // normally be a legal type for the buffer intrinsics to return but that isn't
+ // hooked up through SelectionDAG, return a type of the same width that can be
+ // used with the relevant intrinsics. Specifically, handle the cases:
+ // - <1 x T> => T for all T
+ // - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
+ // - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x
+ // i32>
+ Type *intrinsicTypeFor(Type *LegalType);
+
+ bool visitInstruction(Instruction &I) { return false; }
+ bool visitLoadInst(LoadInst &LI);
+ bool visitStoreInst(StoreInst &SI);
+
+public:
+ LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
+ : IRB(Ctx), DL(DL) {}
+ bool processFunction(Function &F);
+};
+} // namespace
+
+Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
+ ArrayType *AT = dyn_cast<ArrayType>(T);
+ if (!AT)
+ return T;
+ Type *ET = AT->getElementType();
+ if (!ET->isSingleValueType() || isa<VectorType>(ET))
+ report_fatal_error(
+ "loading non-scalar arrays from buffer fat pointers is unimplemented");
+ if (!DL.typeSizeEqualsStoreSize(AT))
+ report_fatal_error(
+        "loading padded arrays from buffer fat pointers is unimplemented");
+ return FixedVectorType::get(ET, AT->getNumElements());
+}
+
+Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
+ Type *TargetType,
+ StringRef Name) {
+ Value *VectorRes = PoisonValue::get(TargetType);
+ auto *VT = cast<FixedVectorType>(TargetType);
+ unsigned EC = VT->getNumElements();
+ for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+ Value *Elem = IRB.CreateExtractValue(V, I, Name + ".elem." + Twine(I));
+ VectorRes = IRB.CreateInsertElement(VectorRes, Elem, I,
+ Name + ".as.vec." + Twine(I));
+ }
+ return VectorRes;
+}
+
+Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
+ Type *OrigType,
+ StringRef Name) {
+ Value *ArrayRes = PoisonValue::get(OrigType);
+ ArrayType *AT = cast<ArrayType>(OrigType);
+ unsigned EC = AT->getNumElements();
+ for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+ Value *Elem = IRB.CreateExtractElement(V, I, Name + ".elem." + Twine(I));
+ ArrayRes = IRB.CreateInsertValue(ArrayRes, Elem, I,
+ Name + ".as.array." + Twine(I));
+ }
+ return ArrayRes;
+}
+
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
+ TypeSize Size = DL.getTypeStoreSizeInBits(T);
+ // Implicitly zero-extend to the next byte if needed
+ if (!DL.typeSizeEqualsStoreSize(T))
+ T = IRB.getIntNTy(Size.getFixedValue());
+ auto *VT = dyn_cast<VectorType>(T);
+ Type *ElemTy = T;
+ if (VT) {
+ ElemTy = VT->getElementType();
+ }
+ if (isa<PointerType>(ElemTy))
+ return T; // Pointers are always big enough
+ unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
+ if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
+ // [vectors of] anything that's 16/32/64/128 bits can be cast and split into
+ // legal buffer operations.
+ return T;
+ }
+ Type *BestVectorElemType = nullptr;
+ if (Size.isKnownMultipleOf(32))
+ BestVectorElemType = IRB.getInt32Ty();
+ else if (Size.isKnownMultipleOf(16))
+ BestVectorElemType = IRB.getInt16Ty();
+ else
+ BestVectorElemType = IRB.getInt8Ty();
+ unsigned NumCastElems =
+ Size.getFixedValue() / BestVectorElemType->getIntegerBitWidth();
+ if (NumCastElems == 1)
+ return BestVectorElemType;
+ return FixedVectorType::get(BestVectorElemType, NumCastElems);
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
+ Value *V, Type *TargetType, StringRef Name) {
+ Type *SourceType = V->getType();
+ if (DL.getTypeSizeInBits(SourceType) != DL.getTypeSizeInBits(TargetType)) {
+ Type *ShortScalarTy =
+ IRB.getIntNTy(DL.getTypeSizeInBits(SourceType).getFixedValue());
+ Type *ByteScalarTy =
+ IRB.getIntNTy(DL.getTypeSizeInBits(TargetType).getFixedValue());
+ Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar");
+ Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext");
+ V = Zext;
+ SourceType = ByteScalarTy;
+ }
+ if (SourceType == TargetType)
+ return V;
+ return IRB.CreateBitCast(V, TargetType, Name + ".legal");
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
+ Value *V, Type *OrigType, StringRef Name) {
+ Type *LegalType = V->getType();
+ if (DL.getTypeSizeInBits(LegalType) != DL.getTypeSizeInBits(OrigType)) {
+ Type *ShortScalarTy =
+ IRB.getIntNTy(DL.getTypeSizeInBits(OrigType).getFixedValue());
+ Type *ByteScalarTy =
+ IRB.getIntNTy(DL.getTypeSizeInBits(LegalType).getFixedValue());
+ Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast");
+ Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc");
+ if (OrigType != ShortScalarTy)
+ return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
+ return Trunc;
+ }
+ if (LegalType == OrigType)
+ return V;
+ return IRB.CreateBitCast(V, OrigType, Name + ".real.ty");
+}
+
+Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
+ auto *VT = dyn_cast<FixedVectorType>(LegalType);
+ if (!VT)
+ return LegalType;
+ Type *ET = VT->getElementType();
+ if (VT->getNumElements() == 1)
+ return ET;
+ if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32)
+ return FixedVectorType::get(IRB.getInt32Ty(), 3);
+ if (ET->isIntegerTy(8)) {
+ switch (VT->getNumElements()) {
+ default:
+ return LegalType; // Let it crash later
+ case 1:
+ return IRB.getInt8Ty();
+ case 2:
+ return IRB.getInt16Ty();
+ case 4:
+ return IRB.getInt32Ty();
+ case 8:
+ return FixedVectorType::get(IRB.getInt32Ty(), 2);
+ case 16:
+ return FixedVectorType::get(IRB.getInt32Ty(), 4);
+ }
+ }
+ return LegalType;
+}
+
+void LegalizeBufferContentTypesVisitor::getSlices(
+ Type *T, SmallVectorImpl<Slice> &Slices) {
+ Slices.clear();
+ auto *VT = dyn_cast<FixedVectorType>(T);
+ if (!VT)
+ return;
+
+ unsigned ElemBitWidth =
+ DL.getTypeSizeInBits(VT->getElementType()).getFixedValue();
+
+ unsigned ElemsPer4Words = 128 / ElemBitWidth;
+ unsigned ElemsPer2Words = ElemsPer4Words / 2;
+ unsigned ElemsPerWord = ElemsPer2Words / 2;
+ unsigned ElemsPerShort = ElemsPerWord / 2;
+ unsigned ElemsPerByte = ElemsPerShort / 2;
+ // If the elements evenly pack into 32-bit words, we can use 3-word stores,
+  // such as for <6 x bfloat> or <3 x i32>, but we can't do this for, for
+ // example, <3 x i64>, since that's not slicing.
+ unsigned ElemsPer3Words = ElemsPerWord * 3;
+
+ unsigned TotalElems = VT->getNumElements();
+ unsigned Off = 0;
+ auto TrySlice = [&](unsigned MaybeLen) {
+ if (MaybeLen > 0 && Off + MaybeLen <= TotalElems) {
+ Slices.emplace_back(/*Offset=*/Off, /*Length=*/MaybeLen);
+ Off += MaybeLen;
+ return true;
+ }
+ return false;
+ };
+ while (Off < TotalElems) {
+ TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
+ TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
+ TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
+ }
+}
+
+Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, Slice S,
+ StringRef Name) {
+ if (S.Length == 1)
+ return IRB.CreateExtractElement(Vec, S.Offset,
+ Name + ".slice." + Twine(S.Offset));
+ SmallVector<int> Mask = llvm::to_vector(llvm::iota_range<int>(
+ S.Offset, S.Offset + S.Length, /*Inclusive=*/false));
+ return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Offset));
+}
+
+Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
+ Slice S, StringRef Name) {
+ if (S.Length == 1) {
+ return IRB.CreateInsertElement(Whole, Part, S.Offset,
+ Name + ".slice." + Twine(S.Offset));
+ }
+ int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements();
+
+ // Extend the slice with poisons to make the main shufflevector happy.
+ SmallVector<int> ExtPartMask(NumElems, -1);
+ for (auto [I, E] : llvm::enumerate(
+ MutableArrayRef<int>(ExtPartMask).take_front(S.Length))) {
+ E = I;
+ }
+ Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask,
+ Name + ".ext." + Twine(S.Offset));
+
+ SmallVector<int> Mask =
+ llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false));
+ for (auto [I, E] :
+ llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Offset, S.Length)))
+ E = I + NumElems;
+ return IRB.CreateShuffleVector(Whole, ExtPart, Mask,
+ Name + ".parts." + Twine(S.Offset));
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) {
+ if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+ Type *OrigType = LI.getType();
+ Type *ArrayAsVecType = scalarArrayTypeAsVector(OrigType);
+ Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+
+ SmallVector<Slice> Slices;
+ getSlices(LegalType, Slices);
+ bool NeedToSplit = Slices.size() > 1;
+ Value *LoadsRes;
+ StringRef Name = LI.getName();
+ if (!NeedToSplit) {
+ Type *LoadableType = intrinsicTypeFor(LegalType);
+ if (LoadableType == OrigType)
+ return false;
+
+ IRB.SetInsertPoint(&LI);
+ auto *NLI = cast<LoadInst>(LI.clone());
+ NLI->mutateType(LoadableType);
+ NLI = IRB.Insert(NLI);
+ NLI->setName(Name + ".loadable");
+
+ LoadsRes = NLI;
+ if (LoadableType != LegalType) {
+ LoadsRes =
+ IRB.CreateBitCast(LoadsRes, LegalType, Name + ".from.loadable");
+ }
+ } else {
+ IRB.SetInsertPoint(&LI);
+ LoadsRes = PoisonValue::get(LegalType);
+ Value *OrigPtr = LI.getPointerOperand();
+    // If we need to split something across more than one load, its legal
+    // type will be a vector (for example, an i256 load will have LegalType =
+    // <8 x i32>).
+ Type *ElemType = cast<VectorType>(LegalType)->getElementType();
+ unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+ AAMDNodes AANodes = LI.getAAMetadata();
+ for (Slice S : Slices) {
+ Type *SliceType =
+ S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+ unsigned ByteOffset = S.Offset * ElemBytes;
+ // You can't reasonably expect loads to wrap around the edge of memory.
+ Value *NewPtr = IRB.CreateGEP(
+ IRB.getInt8Ty(), LI.getPointerOperand(), IRB.getInt32(ByteOffset),
+ OrigPtr->getName() + ".part.ptr." + Twine(S.Offset),
+ GEPNoWrapFlags::noUnsignedWrap());
+ Type *LoadableType = intrinsicTypeFor(SliceType);
+ LoadInst *NewLI = IRB.CreateAlignedLoad(
+ LoadableType, NewPtr, commonAlignment(LI.getAlign(), ByteOffset),
+ Name + ".part." + Twine(S.Offset));
+ copyMetadataForLoad(*NewLI, LI);
+ NewLI->setAAMetadata(
+ AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
+ if (LI.isAtomic())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ if (LI.isVolatile())
+ NewLI->setVolatile(LI.isVolatile());
+ Value *Loaded = NewLI;
+ if (LoadableType != SliceType)
+ Loaded = IRB.CreateBitCast(NewLI, SliceType,
+ NewLI->getName() + ".from.loadable");
+ LoadsRes = insertSlice(LoadsRes, Loaded, S, Name);
+ }
+ }
+ if (LegalType != ArrayAsVecType)
+ LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name);
+ if (ArrayAsVecType != OrigType)
+ LoadsRes = vectorToArray(LoadsRes, OrigType, Name);
+ LoadsRes->takeName(&LI);
+ LI.replaceAllUsesWith(LoadsRes);
+ LI.eraseFromParent();
+ return true;
+}
+
+bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
+ if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+ return false;
+ IRB.SetInsertPoint(&SI);
+ Value *OrigData = SI.getValueOperand();
+ Type *OrigType = OrigData->getType();
+ StringRef Name = OrigData->getName();
+ Value *NewData = OrigData;
+
+ Type *ArrayAsVecType = scalarArrayTypeAsVector(OrigType);
+ if (ArrayAsVecType != OrigType) {
+ NewData = arrayToVector(NewData, ArrayAsVecType, Name);
+ }
+
+ Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+ if (LegalType != ArrayAsVecType) {
+ NewData = makeLegalNonAggregate(NewData, LegalType, Name);
+ }
+
+ SmallVector<Slice> Slices;
+ getSlices(LegalType, Slices);
+ bool NeedToSplit = Slices.size() > 1;
+ if (!NeedToSplit) {
+ Type *StorableType = intrinsicTypeFor(LegalType);
+ if (StorableType == OrigType)
+ return false;
+ if (StorableType != LegalType)
+ NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable");
+
+ SI.setOperand(0, NewData);
+ return true;
+ }
+
+ Value *OrigPtr = SI.getPointerOperand();
+ Type *ElemType = cast<VectorType>(LegalType)->getElementType();
+ unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+ AAMDNodes AANodes = SI.getAAMetadata();
+ for (Slice S : Slices) {
+ Type *SliceType =
+ S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+ unsigned ByteOffset = S.Offset * ElemBytes;
+ Value *NewPtr =
+ IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
+ OrigPtr->getName() + ".part." + Twine(S.Offset),
+ GEPNoWrapFlags::noUnsignedWrap());
+ Value *DataSlice = extractSlice(NewData, S, Name);
+ Type *StorableType = intrinsicTypeFor(SliceType);
+ if (StorableType != SliceType) {
+ DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
+ DataSlice->getName() + ".storable");
+ }
+ auto *NewSI = cast<StoreInst>(SI.clone());
+ NewSI->setAlignment(commonAlignment(SI.getAlign(), ByteOffset));
+ IRB.Insert(NewSI);
+ NewSI->setOperand(0, DataSlice);
+ NewSI->setOperand(1, NewPtr);
+ NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL));
+ }
+ SI.eraseFromParent();
+ return true;
+}
+
+bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
+ bool Changed = false;
+ for (Instruction &I : make_early_inc_range(instructions(F))) {
+ Changed |= visit(I);
+ }
+ return Changed;
+}
+
/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered
/// buffer fat pointer constant.
static std::pair<Constant *, Constant *>
@@ -1256,7 +1703,7 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
auto [Rsrc, Off] = getPtrParts(Ptr);
const DataLayout &DL = GEP.getDataLayout();
- bool InBounds = GEP.isInBounds();
+ bool IsNUW = GEP.hasNoUnsignedWrap();
// In order to call emitGEPOffset() and thus not have to reimplement it,
// we need the GEP result to have ptr addrspace(7) type.
@@ -1271,16 +1718,12 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return {Rsrc, Off};
}
- bool HasNonNegativeOff = false;
- if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
- HasNonNegativeOff = !CI->isNegative();
- }
Value *NewOff;
if (match(Off, m_Zero())) {
NewOff = OffAccum;
} else {
NewOff = IRB.CreateAdd(Off, OffAccum, "",
- /*hasNUW=*/InBounds && HasNonNegativeOff,
+ /*hasNUW=*/IsNUW,
/*hasNSW=*/false);
}
copyMetadata(NewOff, &GEP);
@@ -1781,12 +2224,16 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
}
StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+ LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
+ M.getContext());
for (Function &F : M.functions()) {
bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
bool BodyChanges = containsBufferFatPointers(F, &StructTM);
Changed |= MemOpsRewrite.processFunction(F);
- if (InterfaceChange || BodyChanges)
+ if (InterfaceChange || BodyChanges) {
NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
+ Changed |= BufferContentsTypeRewrite.processFunction(F);
+ }
}
if (NeedsRemap.empty())
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index ef9adde13348fe..902feacede83f4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -590,7 +590,7 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, p1, p4, v4i16, v4f16, v4bf16]>;
def Reg96Types : RegisterTypes<[v3i32, v3f32]>;
def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>;
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 7659fc69196151..6be6c8161880cf 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3445,6 +3445,8 @@ void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
MDBuilder MDB(Dest.getContext());
Type *NewType = Dest.getType();
const DataLayout &DL = Source.getDataLayout();
+ LLVMContext &Ctx = Dest.getContext();
+
for (const auto &MDPair : MD) {
unsigned ID = MDPair.first;
MDNode *N = MDPair.second;
@@ -3488,6 +3490,16 @@ void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
copyRangeMetadata(DL, Source, N, Dest);
break;
}
+ // Extended last-use / nontemporal hint on AMD GPUs
+  if (ID == Ctx.getMDKindID("amdgpu.last.use"))
+ Dest.setMetadata(ID, N);
+ // Currently only relevant to atomics
+ else if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory"))
+ Dest.setMetadata(ID, N);
+ else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory"))
+ Dest.setMetadata(ID, N);
+ else if (ID == Ctx.getMDKindID("amdgpu.ignore.denormal.mode"))
+ Dest.setMetadata(ID, N);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
new file mode 100644
index 00000000000000..b8d01c12b5b180
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -0,0 +1,4871 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
+
+; Note: if you're adding tests here, also add them to
+; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
+; the lowering.
+
+;;; Legal types. These are natively supported; no casts should be performed.
+
+define i8 @load_i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i8, ptr addrspace(7) %p
+ ret i8 %ret
+}
+
+define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i8 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i16 @load_i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i16, ptr addrspace(7) %p
+ ret i16 %ret
+}
+
+define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i16 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i32 @load_i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i32, ptr addrspace(7) %p
+ ret i32 %ret
+}
+
+define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i32 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i64 @load_i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i64, ptr addrspace(7) %p
+ ret i64 %ret
+}
+
+define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i64 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i128 @load_i128(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i128:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i128:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i128, ptr addrspace(7) %p
+ ret i128 %ret
+}
+
+define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i128:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i128:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i128 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <1 x i32>, ptr addrspace(7) %p
+ ret <1 x i32> %ret
+}
+
+define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <1 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i32>, ptr addrspace(7) %p
+ ret <2 x i32> %ret
+}
+
+define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x i32>, ptr addrspace(7) %p
+ ret <3 x i32> %ret
+}
+
+define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i32>, ptr addrspace(7) %p
+ ret <4 x i32> %ret
+}
+
+define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i16>, ptr addrspace(7) %p
+ ret <2 x i16> %ret
+}
+
+define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i16>, ptr addrspace(7) %p
+ ret <4 x i16> %ret
+}
+
+define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i16>, ptr addrspace(7) %p
+ ret <8 x i16> %ret
+}
+
+define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i64>, ptr addrspace(7) %p
+ ret <2 x i64> %ret
+}
+
+define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i64> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define half @load_f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load half, ptr addrspace(7) %p
+ ret half %ret
+}
+
+define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store half %data, ptr addrspace(7) %p
+ ret void
+}
+
+define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load bfloat, ptr addrspace(7) %p
+ ret bfloat %ret
+}
+
+define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store bfloat %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x half>, ptr addrspace(7) %p
+ ret <2 x half> %ret
+}
+
+define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x half> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x bfloat>, ptr addrspace(7) %p
+ ret <4 x bfloat> %ret
+}
+
+define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4bf16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4bf16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x bfloat> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x half>, ptr addrspace(7) %p
+ ret <8 x half> %ret
+}
+
+define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x half> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define float @load_f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load float, ptr addrspace(7) %p
+ ret float %ret
+}
+
+define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store float %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x float>, ptr addrspace(7) %p
+ ret <2 x float> %ret
+}
+
+define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x float>, ptr addrspace(7) %p
+ ret <3 x float> %ret
+}
+
+define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x float>, ptr addrspace(7) %p
+ ret <4 x float> %ret
+}
+
+define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(0), ptr addrspace(7) %p
+ ret ptr addrspace(0) %ret
+}
+
+define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(0) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(1), ptr addrspace(7) %p
+ ret ptr addrspace(1) %ret
+}
+
+define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(1) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(2), ptr addrspace(7) %p
+ ret ptr addrspace(2) %ret
+}
+
+define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(2) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(3), ptr addrspace(7) %p
+ ret ptr addrspace(3) %ret
+}
+
+define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(3) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(4), ptr addrspace(7) %p
+ ret ptr addrspace(4) %ret
+}
+
+define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(4) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(5), ptr addrspace(7) %p
+ ret ptr addrspace(5) %ret
+}
+
+define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(5) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(6), ptr addrspace(7) %p
+ ret ptr addrspace(6) %ret
+}
+
+define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(6) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load ptr addrspace(8), ptr addrspace(7) %p
+ ret ptr addrspace(8) %ret
+}
+
+define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store ptr addrspace(8) %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x ptr addrspace(1)>, ptr addrspace(7) %p
+ ret <2 x ptr addrspace(1)> %ret
+}
+
+define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x ptr addrspace(5)>, ptr addrspace(7) %p
+ ret <2 x ptr addrspace(5)> %ret
+}
+
+define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x ptr addrspace(5)>, ptr addrspace(7) %p
+ ret <3 x ptr addrspace(5)> %ret
+}
+
+define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x ptr addrspace(5)>, ptr addrspace(7) %p
+ ret <4 x ptr addrspace(5)> %ret
+}
+
+define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p5:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p5:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; 3 words' worth of data in a type with 16-bit elements. Such values need to be bitcast to <3 x i32> to be supported.
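+;;; A minimal sketch of the rewrite for the load case (value names here are
+;;; illustrative, not the pass's actual output):
+;;;   %legal = load <3 x i32>, ptr addrspace(7) %p
+;;;   %ret = bitcast <3 x i32> %legal to <6 x half>
+;;; The store case mirrors this, bitcasting the data to <3 x i32> before the
+;;; store; the bitcast is free at the register level, so the checks below show
+;;; plain dwordx3 accesses with no extra instructions.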
+
+define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x half>, ptr addrspace(7) %p
+ ret <6 x half> %ret
+}
+
+define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x half> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Long types (32-bit elements). These must be split into multiple operations.
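+;;; A sketch of the split for a <5 x float> load (intrinsic arguments are
+;;; abbreviated and the value names are illustrative):
+;;;   %part.0 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %buf, i32 0, ...)
+;;;   %part.1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %buf, i32 16, ...)
+;;; with the pieces reassembled into the <5 x float> result, matching the
+;;; dwordx4 + dword (offset:16) pairs checked below.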
+
+define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <5 x float>, ptr addrspace(7) %p
+ ret <5 x float> %ret
+}
+
+define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <5 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x float>, ptr addrspace(7) %p
+ ret <6 x float> %ret
+}
+
+define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx3 v[4:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <7 x float>, ptr addrspace(7) %p
+ ret <7 x float> %ret
+}
+
+define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx3 v[4:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <7 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x float>, ptr addrspace(7) %p
+ ret <8 x float> %ret
+}
+
+define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v10f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v10f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: buffer_load_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <10 x float>, ptr addrspace(7) %p
+ ret <10 x float> %ret
+}
+
+define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v10f32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: buffer_store_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v10f32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <10 x float> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x i32>, ptr addrspace(7) %p
+ ret <6 x i32> %ret
+}
+
+define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x i32> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x ptr addrspace(1)>, ptr addrspace(7) %p
+ ret <4 x ptr addrspace(1)> %ret
+}
+
+define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Unevenly sized types with 16-bit elements. These require splitting into multiple operations.
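+;;; A sketch for a <3 x i16> load (same caveats as the sketches above):
+;;;   %part.0 = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) %buf, i32 0, ...)
+;;;   %part.1 = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %buf, i32 4, ...)
+;;; which is why the checks expect a dword access followed by a ushort/short
+;;; access at offset:4.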
+
+define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <1 x i16>, ptr addrspace(7) %p
+ ret <1 x i16> %ret
+}
+
+define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <1 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x i16>, ptr addrspace(7) %p
+ ret <3 x i16> %ret
+}
+
+define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:8
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:8
+; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <5 x i16>, ptr addrspace(7) %p
+ ret <5 x i16> %ret
+}
+
+define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_short v2, off, s[8:11], 0 offset:8
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:8
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <5 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x i16>, ptr addrspace(7) %p
+ ret <6 x i16> %ret
+}
+
+define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ushort v3, off, s[8:11], 0 offset:12
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:12
+; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <7 x i16>, ptr addrspace(7) %p
+ ret <7 x i16> %ret
+}
+
+define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_short v3, off, s[8:11], 0 offset:12
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:12
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <7 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v9i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ushort v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v9i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT: v_bfi_b32 v3, s4, v3, v3
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <9 x i16>, ptr addrspace(7) %p
+ ret <9 x i16> %ret
+}
+
+define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v9i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_short v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v9i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_short v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <9 x i16> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Byte vectors. These need to be
+;;; - split into multiple operations, and/or
+;;; - bitcast when they have a natively supported width (see the sketch below)
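+;
+; Illustrative sketch (not part of the generated checks; names are made up):
+; a <4 x i8> value already has a natively supported width, so it is bitcast
+; to a single dword operation,
+;   %as.i32 = bitcast <4 x i8> %data to i32
+;   store i32 %as.i32, ptr addrspace(7) %p
+; while widths like <3 x i8> split into a ushort plus a trailing ubyte, as
+; the buffer_store_short + buffer_store_byte pairs below verify.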
+
+define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <1 x i8>, ptr addrspace(7) %p
+ ret <1 x i8> %ret
+}
+
+define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <1 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i8>, ptr addrspace(7) %p
+ ret <2 x i8> %ret
+}
+
+define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <3 x i8>, ptr addrspace(7) %p
+ ret <3 x i8> %ret
+}
+
+define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_store_byte v2, off, s[8:11], 0 offset:2
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <3 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i8>, ptr addrspace(7) %p
+ ret <4 x i8> %ret
+}
+
+define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v5, 8
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <5 x i8>, ptr addrspace(7) %p
+ ret <5 x i8> %ret
+}
+
+define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_store_byte v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v6, 8
+; GISEL-NEXT: v_mov_b32_e32 v5, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v5, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <5 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v6, off, s[8:11], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v6
+; SDAG-NEXT: v_mov_b32_e32 v1, v7
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <6 x i8>, ptr addrspace(7) %p
+ ret <6 x i8> %ret
+}
+
+define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_store_short v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <6 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ushort v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:6
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:6
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <7 x i8>, ptr addrspace(7) %p
+ ret <7 x i8> %ret
+}
+
+define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v5
+; SDAG-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT: buffer_store_byte v6, off, s[8:11], 0 offset:6
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v8, 8
+; GISEL-NEXT: v_mov_b32_e32 v7, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: v_and_b32_e32 v0, 0xff, v5
+; GISEL-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
+; GISEL-NEXT: buffer_store_byte v6, off, s[4:7], 0 offset:6
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <7 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v1
+; SDAG-NEXT: v_mov_b32_e32 v1, v8
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v8, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, v8
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i8>, ptr addrspace(7) %p
+ ret <8 x i8> %ret
+}
+
+define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx2 v[3:4], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v9, 8
+; GISEL-NEXT: v_mov_b32_e32 v8, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v8, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT: v_and_or_b32 v1, v4, v8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v12i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT: v_mov_b32_e32 v4, v1
+; SDAG-NEXT: v_mov_b32_e32 v1, v14
+; SDAG-NEXT: v_mov_b32_e32 v2, v13
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v12i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v12
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <12 x i8>, ptr addrspace(7) %p
+ ret <12 x i8> %ret
+}
+
+define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v12i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx3 v[6:8], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v12i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v13, 8
+; GISEL-NEXT: v_mov_b32_e32 v12, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v12, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT: v_and_or_b32 v1, v4, v12, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT: v_and_or_b32 v2, v8, v12, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <12 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v16i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b64 v[18:19], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3]
+; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v3
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3
+; SDAG-NEXT: v_mov_b32_e32 v4, v1
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v12, v3
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v16
+; SDAG-NEXT: v_mov_b32_e32 v3, v18
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v16i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v16, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v18, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GISEL-NEXT: v_mov_b32_e32 v12, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v16
+; GISEL-NEXT: v_mov_b32_e32 v2, v17
+; GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <16 x i8>, ptr addrspace(7) %p
+ ret <16 x i8> %ret
+}
+
+define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v16i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx4 v[9:12], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v16i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v17, 8
+; GISEL-NEXT: v_mov_b32_e32 v16, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v16, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT: v_and_or_b32 v1, v4, v16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT: v_and_or_b32 v2, v8, v16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT: v_and_or_b32 v3, v12, v16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <16 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[48:51], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[33:34]
+; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[35:36]
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b64 v[19:20], 24, v[48:49]
+; SDAG-NEXT: v_lshrrev_b64 v[27:28], 24, v[50:51]
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v33
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v33
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v34
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v34
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v34
+; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v35
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v35
+; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v36
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v36
+; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v36
+; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v48
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v48
+; SDAG-NEXT: v_lshrrev_b32_e32 v21, 8, v49
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v49
+; SDAG-NEXT: v_lshrrev_b32_e32 v23, 24, v49
+; SDAG-NEXT: v_lshrrev_b32_e32 v25, 8, v50
+; SDAG-NEXT: v_lshrrev_b32_e32 v26, 16, v50
+; SDAG-NEXT: v_lshrrev_b32_e32 v29, 8, v51
+; SDAG-NEXT: v_lshrrev_b32_e32 v30, 16, v51
+; SDAG-NEXT: v_lshrrev_b32_e32 v31, 24, v51
+; SDAG-NEXT: v_mov_b32_e32 v0, v33
+; SDAG-NEXT: v_mov_b32_e32 v4, v34
+; SDAG-NEXT: v_mov_b32_e32 v8, v35
+; SDAG-NEXT: v_mov_b32_e32 v12, v36
+; SDAG-NEXT: v_mov_b32_e32 v16, v48
+; SDAG-NEXT: v_mov_b32_e32 v20, v49
+; SDAG-NEXT: v_mov_b32_e32 v24, v50
+; SDAG-NEXT: v_mov_b32_e32 v28, v51
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v35, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v36, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v37, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v32, 8, v16
+; GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GISEL-NEXT: v_lshrrev_b32_e32 v34, 24, v16
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v21, 8, v17
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GISEL-NEXT: v_lshrrev_b32_e32 v23, 24, v17
+; GISEL-NEXT: v_lshrrev_b32_e32 v25, 8, v18
+; GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GISEL-NEXT: v_lshrrev_b32_e32 v27, 24, v18
+; GISEL-NEXT: v_lshrrev_b32_e32 v29, 8, v19
+; GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v19
+; GISEL-NEXT: v_lshrrev_b32_e32 v31, 24, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GISEL-NEXT: v_mov_b32_e32 v12, v3
+; GISEL-NEXT: v_mov_b32_e32 v20, v17
+; GISEL-NEXT: v_mov_b32_e32 v24, v18
+; GISEL-NEXT: v_mov_b32_e32 v28, v19
+; GISEL-NEXT: v_mov_b32_e32 v1, v35
+; GISEL-NEXT: v_mov_b32_e32 v2, v36
+; GISEL-NEXT: v_mov_b32_e32 v3, v37
+; GISEL-NEXT: v_mov_b32_e32 v17, v32
+; GISEL-NEXT: v_mov_b32_e32 v18, v33
+; GISEL-NEXT: v_mov_b32_e32 v19, v34
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <32 x i8>, ptr addrspace(7) %p
+ ret <32 x i8> %ret
+}
+
+define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_load_ubyte v14, off, s[0:3], s32
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v29
+; SDAG-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT: v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v25
+; SDAG-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v27
+; SDAG-NEXT: v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v21
+; SDAG-NEXT: v_lshlrev_b16_e32 v2, 8, v23
+; SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v17
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_lshlrev_b16_e32 v15, 8, v19
+; SDAG-NEXT: v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v4, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshlrev_b16_e32 v0, 8, v14
+; SDAG-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dwordx4 v[3:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v31, 8
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_mov_b32_e32 v32, 0xff
+; GISEL-NEXT: v_and_or_b32 v0, v0, v32, v1
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GISEL-NEXT: buffer_load_ubyte v7, off, s[0:3], s32
+; GISEL-NEXT: v_and_or_b32 v1, v4, v32, v1
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT: v_or3_b32 v1, v1, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT: v_and_or_b32 v2, v8, v32, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_sdwa v3, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT: v_and_or_b32 v3, v12, v32, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT: v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT: v_lshlrev_b32_sdwa v4, v31, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v18
+; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GISEL-NEXT: v_and_or_b32 v4, v16, v32, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_or3_b32 v4, v4, v5, v6
+; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v22
+; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v23
+; GISEL-NEXT: v_and_or_b32 v8, v20, v32, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT: v_or3_b32 v5, v8, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_sdwa v6, v31, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v26
+; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v27
+; GISEL-NEXT: v_and_or_b32 v6, v24, v32, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GISEL-NEXT: v_or3_b32 v6, v6, v8, v9
+; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v30
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_and_or_b32 v8, v28, v32, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GISEL-NEXT: v_or3_b32 v7, v8, v9, v7
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <32 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Arrays. Need to become vectors.
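+; For example (an illustrative sketch, not the exact IR the pass emits; the
+; value names here are made up):
+;   %ret = load [2 x i32], ptr addrspace(7) %p
+; becomes a legal vector load that is unpacked back into the array:
+;   %v = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;   %v0 = extractelement <2 x i32> %v, i64 0
+;   %v1 = extractelement <2 x i32> %v, i64 1
+;   %arr0 = insertvalue [2 x i32] poison, i32 %v0, 0
+;   %arr = insertvalue [2 x i32] %arr0, i32 %v1, 1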
+
+define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [1 x i32], ptr addrspace(7) %p
+ ret [1 x i32] %ret
+}
+
+define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a1i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a1i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [1 x i32] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x i32], ptr addrspace(7) %p
+ ret [2 x i32] %ret
+}
+
+define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x i32] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x half], ptr addrspace(7) %p
+ ret [2 x half] %ret
+}
+
+define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0x5040100
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_perm_b32 v0, v1, v0, s4
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x half] %data, ptr addrspace(7) %p
+ ret void
+}
+
+define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load [2 x ptr addrspace(1)], ptr addrspace(7) %p
+ ret [2 x ptr addrspace(1)] %ret
+}
+
+define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2p1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2p1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store [2 x ptr addrspace(1)] %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Scalars of atypical width. Need to be cast to vectors and split.
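+; For example, an i40 (5-byte) access splits into a dword piece and a byte
+; piece, matching the buffer_store_dword + buffer_store_byte in the tests
+; below. An illustrative sketch of the store rewrite (names made up, and the
+; exact IR shape may differ):
+;   %v = bitcast i40 %data to <5 x i8>
+;   %lo = shufflevector <5 x i8> %v, <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;   %lo.32 = bitcast <4 x i8> %lo to i32
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %lo.32, ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;   %hi = extractelement <5 x i8> %v, i64 4
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %hi, ptr addrspace(8) %buf, i32 4, i32 0, i32 0)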
+
+define i40 @load_i40(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i40:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i40:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xff
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 24, v0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v4
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i40, ptr addrspace(7) %p
+ ret i40 %ret
+}
+
+define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i40:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_store_byte v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i40:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i40 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i96 @load_i96(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i96:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i96:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i96, ptr addrspace(7) %p
+ ret i96 %ret
+}
+
+define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i96:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i96:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i96 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i160 @load_i160(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i160:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_mov_b32 s4, s33
+; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT: s_addk_i32 s32, 0x1800
+; SDAG-NEXT: s_addk_i32 s32, 0xe800
+; SDAG-NEXT: s_mov_b32 s33, s4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i160:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i160, ptr addrspace(7) %p
+ ret i160 %ret
+}
+
+define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i160:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, s33
+; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT: s_addk_i32 s32, 0x1000
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_addk_i32 s32, 0xf000
+; SDAG-NEXT: s_mov_b32 s33, s4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i160:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i160 %data, ptr addrspace(7) %p
+ ret void
+}
+
+define i256 @load_i256(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i256:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i256:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i256, ptr addrspace(7) %p
+ ret i256 %ret
+}
+
+define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i256:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i256:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i256 %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Non-byte-sized scalars. Require zero-extension.
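+; For example (illustrative sketch, names made up): an i7 store is
+; zero-extended to a byte, which is why the assembly below masks with 0x7f
+; before the buffer_store_byte:
+;   %ext = zext i7 %data to i8
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %ext, ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+; and loads truncate back: %ret = trunc i8 %loaded to i7.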
+
+define i7 @load_i7(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i7:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i7:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load i7, ptr addrspace(7) %p
+ ret i7 %ret
+}
+
+define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i7:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; SDAG-NEXT: buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i7:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GISEL-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store i7 %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Byte-sized vectors of i4. Require casts.
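+; For example (illustrative sketch, names made up): a <2 x i4> occupies
+; exactly one byte, so it is bitcast to i8 and stored as a single byte:
+;   %b = bitcast <2 x i4> %data to i8
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %b, ptr addrspace(8) %buf, i32 0, i32 0, i32 0)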
+
+define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT: buffer_load_ubyte v1, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT: v_lshrrev_b16_e32 v1, 4, v1
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i4>, ptr addrspace(7) %p
+ ret <2 x i4> %ret
+}
+
+define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], s32
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i4> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_short v0, off, s[0:3], s32
+; SDAG-NEXT: buffer_load_ushort v1, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b16_e32 v4, 4, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT: v_lshrrev_b16_e32 v3, 12, v1
+; SDAG-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v4
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <4 x i4>, ptr addrspace(7) %p
+ ret <4 x i4> %ret
+}
+
+define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_mov_b32_e32 v1, 15
+; SDAG-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 12, v3
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 15
+; GISEL-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 12, v1
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <4 x i4> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v7, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 15, v7
+; SDAG-NEXT: v_bfe_u32 v1, v7, 4, 4
+; SDAG-NEXT: v_bfe_u32 v2, v7, 8, 4
+; SDAG-NEXT: v_bfe_u32 v3, v7, 12, 4
+; SDAG-NEXT: v_bfe_u32 v4, v7, 16, 4
+; SDAG-NEXT: v_bfe_u32 v5, v7, 20, 4
+; SDAG-NEXT: v_bfe_u32 v6, v7, 24, 4
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 28, v7
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 28, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <8 x i4>, ptr addrspace(7) %p
+ ret <8 x i4> %ret
+}
+
+define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; SDAG-NEXT: v_and_or_b32 v0, v0, 15, v1
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v3
+; SDAG-NEXT: v_and_b32_e32 v2, 15, v2
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 12, v1
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT: v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT: v_and_b32_e32 v1, 15, v5
+; SDAG-NEXT: v_mov_b32_e32 v2, 15
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 20, v1
+; SDAG-NEXT: v_and_b32_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT: v_or3_b32 v0, v0, v3, v1
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 28, v7
+; SDAG-NEXT: v_and_b32_sdwa v2, v6, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GISEL-NEXT: v_and_or_b32 v0, v0, 15, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 15, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 15, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 12, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: v_mov_b32_e32 v1, 15
+; GISEL-NEXT: v_and_b32_e32 v3, 15, v5
+; GISEL-NEXT: v_and_b32_sdwa v2, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v3
+; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT: v_and_b32_e32 v2, 15, v7
+; GISEL-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 28, v2
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <8 x i4> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Vectors of non-byte-sized integers.
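+; For example (illustrative sketch, names made up): a <2 x i6> is bitcast to
+; i12 and then zero-extended to i16, hence the 0xfff mask before the
+; buffer_store_short in the tests below:
+;   %bits = bitcast <2 x i6> %data to i12
+;   %ext = zext i12 %bits to i16
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %ext, ptr addrspace(8) %buf, i32 0, i32 0, i32 0)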
+
+define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v0, 63, v1
+; SDAG-NEXT: v_bfe_u32 v1, v1, 6, 6
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b16_e32 v1, 6, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <2 x i6>, ptr addrspace(7) %p
+ ret <2 x i6> %ret
+}
+
+define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 6, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 63, v0
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 63, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 63, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 6, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <2 x i6> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;; Blocks of fp6-style 6-bit elements (represented here as i6)
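+; <32 x i6> is 192 bits, which bitcasts exactly to <6 x i32>; an illustrative
+; sketch of the rewritten load (split into dwordx4 + dwordx2 pieces, as the
+; assembly below shows; names made up):
+;   %lo = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %buf, i32 0, i32 0, i32 0)
+;   %hi = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %buf, i32 16, i32 0, i32 0)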
+define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load <32 x i6>, ptr addrspace(7) %p
+ %ret.cast = bitcast <32 x i6> %ret to <6 x i32>
+ ret <6 x i32> %ret.cast
+}
+
+define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %data = bitcast <6 x i32> %data.abi to <32 x i6>
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store <32 x i6> %data, ptr addrspace(7) %p
+ ret void
+}
+
+;;; Modifiers
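+; Volatile accesses are still split into legal pieces; the volatility is
+; reflected in the glc bit on each of the loads below.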
+
+define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load volatile <4 x i8>, ptr addrspace(7) %p
+ ret <4 x i8> %ret
+}
+
+define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v4i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v4i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v5, 8
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT: v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store volatile <4 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
+
+define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
+; SDAG-NEXT: buffer_load_ushort v6, off, s[8:11], 0 offset:4 glc
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v6
+; SDAG-NEXT: v_mov_b32_e32 v1, v7
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GISEL-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:4 glc
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ %ret = load volatile <6 x i8>, ptr addrspace(7) %p
+ ret <6 x i8> %ret
+}
+
+define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v6i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: s_mov_b32 s11, s17
+; SDAG-NEXT: s_mov_b32 s10, s16
+; SDAG-NEXT: s_mov_b32 s9, s7
+; SDAG-NEXT: s_mov_b32 s8, s6
+; SDAG-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_store_short v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v6i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: s_mov_b32 s4, s6
+; GISEL-NEXT: s_mov_b32 s5, s7
+; GISEL-NEXT: s_mov_b32 s6, s16
+; GISEL-NEXT: s_mov_b32 s7, s17
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+ store volatile <6 x i8> %data, ptr addrspace(7) %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 6f0d51a0277380..7e768982ba4286 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -91,7 +91,12 @@ define void @caller(ptr addrspace(7) noundef nonnull %arg) {
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i160 [[V_INT_RSRC]], 32
; CHECK-NEXT: [[V_INT_OFF:%.*]] = zext i32 [[V_OFF]] to i160
; CHECK-NEXT: [[V_INT:%.*]] = or i160 [[TMP1]], [[V_INT_OFF]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[V_INT]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[V_INT_CAST:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
+; CHECK-NEXT: [[V_INT_CAST_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_CAST]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_CAST_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[ARG_PART_4:%.*]] = add nuw i32 [[ARG_OFF]], 16
+; CHECK-NEXT: [[V_INT_CAST_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_CAST]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_CAST_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
%v = call ptr addrspace(7) @extern(ptr addrspace(7) %arg)
@@ -104,7 +109,7 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n
; CHECK-SAME: ({ ptr addrspace(8), i32 } noundef [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0
; CHECK-NEXT: [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1
-; CHECK-NEXT: [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4
+; CHECK-NEXT: [[RET:%.*]] = add i32 [[ARG_OFF]], 4
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0
; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1
; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index 5b225636b120a4..c821d0abfc1f5e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
+; Note: if you're adding tests here, also add them to
+; buffer-fat-pointers-contents-legalization.ll to make sure the output of this
+; transformation can codegen.
+
target triple = "amdgcn--"
;;; Legal types. These are natively supported, no casts should be performed.
@@ -118,7 +122,8 @@ define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <1 x i32> @load_v1i32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <1 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
; CHECK-NEXT: ret <1 x i32> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -129,7 +134,8 @@ define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v1i32(
; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i32(<1 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -758,7 +764,8 @@ define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <6 x half> @load_v6f16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x half>
; CHECK-NEXT: ret <6 x half> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -769,7 +776,8 @@ define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v6f16(
; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6f16(<6 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <6 x half> [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -782,7 +790,11 @@ define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <5 x float> @load_v5f32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <5 x float> @llvm.amdgcn.raw.ptr.buffer.load.v5f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x float> poison, <5 x float> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x float> [[RET_PARTS_0]], float [[RET_PART_4]], i64 4
; CHECK-NEXT: ret <5 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -793,7 +805,10 @@ define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v5f32(
; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5f32(<5 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x float> [[DATA]], <5 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x float> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -804,7 +819,12 @@ define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <6 x float> @load_v6f32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x float> @llvm.amdgcn.raw.ptr.buffer.load.v6f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <6 x float> poison, <6 x float> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x float> [[RET_PART_4]], <2 x float> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <6 x float> [[RET_PARTS_0]], <6 x float> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
; CHECK-NEXT: ret <6 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -815,7 +835,10 @@ define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v6f32(
; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6f32(<6 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -826,7 +849,12 @@ define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <7 x float> @load_v7f32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <7 x float> @llvm.amdgcn.raw.ptr.buffer.load.v7f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x float> poison, <7 x float> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <3 x float> [[RET_PART_4]], <3 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <7 x float> [[RET_PARTS_0]], <7 x float> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 9>
; CHECK-NEXT: ret <7 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -837,7 +865,10 @@ define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v7f32(
; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7f32(<7 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -848,7 +879,12 @@ define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <8 x float> @load_v8f32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x float> @llvm.amdgcn.raw.ptr.buffer.load.v8f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x float> poison, <8 x float> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_PART_4]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <8 x float> [[RET_PARTS_0]], <8 x float> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: ret <8 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -859,7 +895,10 @@ define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v8f32(
; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f32(<8 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -870,7 +909,15 @@ define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <10 x float> @load_v10f32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <10 x float> @llvm.amdgcn.raw.ptr.buffer.load.v10f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <10 x float> poison, <10 x float> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_PART_4]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <10 x float> [[RET_PARTS_0]], <10 x float> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; CHECK-NEXT: [[RET_PART_8:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_8:%.*]] = shufflevector <2 x float> [[RET_PART_8]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <10 x float> [[RET_PARTS_4]], <10 x float> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
; CHECK-NEXT: ret <10 x float> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -881,7 +928,12 @@ define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v10f32(
; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v10f32(<10 x float> [[DATA]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <2 x i32> <i32 8, i32 9>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_8]], ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -892,7 +944,12 @@ define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <6 x i32> @load_v6i32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v6i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_PART_4]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
; CHECK-NEXT: ret <6 x i32> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -903,7 +960,10 @@ define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v6i32(
; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i32(<6 x i32> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -914,7 +974,12 @@ define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <4 x ptr addrspace(1)> @load_v4p1(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v4p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_PART_0]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <4 x ptr addrspace(1)> poison, <4 x ptr addrspace(1)> [[RET_EXT_0]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[RET_PART_2:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_2:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_PART_2]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET:%.*]] = shufflevector <4 x ptr addrspace(1)> [[RET_PARTS_0]], <4 x ptr addrspace(1)> [[RET_EXT_2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: ret <4 x ptr addrspace(1)> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -925,7 +990,10 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v4p1(
; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4p1(<4 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_2]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
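; Note: <4 x ptr addrspace(1)> is 32 bytes of 64-bit elements, so it splits
; cleanly into two legal 16-byte <2 x ptr addrspace(1)> pieces with no
; bitcasts, only the offset-16 second access seen above.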
@@ -938,7 +1006,8 @@ define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <1 x i16> @load_v1i16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <1 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v1i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <1 x i16>
; CHECK-NEXT: ret <1 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -949,7 +1018,8 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v1i16(
; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i16(<1 x i16> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <1 x i16> [[DATA]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_STORABLE]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
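; Note: single-element vectors have no legal buffer operation of their own,
; so <1 x i16> round-trips through plain i16: a bitcast after the load and a
; bitcast before the store.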
@@ -960,7 +1030,11 @@ define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <3 x i16> @load_v3i16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <3 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v3i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
; CHECK-NEXT: ret <3 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -971,7 +1045,10 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v3i16(
; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i16(<3 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
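; Note: <3 x i16> is 48 bits, which is not a supported access size, so it
; becomes a <2 x i16> piece at offset 0 plus a scalar i16 tail at offset 4;
; the scalar tail is merged back with insertelement/extractelement instead
; of a shuffle.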
@@ -982,7 +1059,11 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <5 x i16> @load_v5i16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <5 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v5i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i16> [[RET_PART_0]], <4 x i16> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i16> poison, <5 x i16> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x i16> [[RET_PARTS_0]], i16 [[RET_PART_4]], i64 4
; CHECK-NEXT: ret <5 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -993,7 +1074,10 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v5i16(
; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5i16(<5 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i16> [[DATA]], <5 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i16> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1004,7 +1088,8 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <6 x i16> @load_v6i16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x i16>
; CHECK-NEXT: ret <6 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1015,7 +1100,8 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v6i16(
; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <6 x i16> [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
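; Note: <6 x i16> is exactly 96 bits, which the hardware can move as three
; dwords, so the whole access is handled by a single bitcast to/from
; <3 x i32> with no splitting at all.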
@@ -1026,7 +1112,12 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <7 x i16> @load_v7i16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <7 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v7i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast <3 x i32> [[RET_PART_0]] to <6 x i16>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <6 x i16> [[RET_PART_0_FROM_LOADABLE]], <6 x i16> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x i16> poison, <7 x i16> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6>
+; CHECK-NEXT: [[RET_PART_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <7 x i16> [[RET_PARTS_0]], i16 [[RET_PART_6]], i64 6
; CHECK-NEXT: ret <7 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1037,7 +1128,11 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v7i16(
; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7i16(<7 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x i16> [[DATA]], <7 x i16> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <6 x i16> [[DATA_SLICE_0]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <7 x i16> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1048,7 +1143,11 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <9 x i16> @load_v9i16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <9 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v9i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <8 x i16> [[RET_PART_0]], <8 x i16> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <9 x i16> poison, <9 x i16> [[RET_EXT_0]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT: [[RET_PART_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <9 x i16> [[RET_PARTS_0]], i16 [[RET_PART_8]], i64 8
; CHECK-NEXT: ret <9 x i16> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1059,7 +1158,10 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v9i16(
; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v9i16(<9 x i16> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <9 x i16> [[DATA]], <9 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_8:%.*]] = extractelement <9 x i16> [[DATA]], i64 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1074,7 +1176,8 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <1 x i8> @load_v1i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <1 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v1i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <1 x i8>
; CHECK-NEXT: ret <1 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1085,7 +1188,8 @@ define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v1i8(
; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i8(<1 x i8> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <1 x i8> [[DATA]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1096,7 +1200,8 @@ define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <2 x i8> @load_v2i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v2i8(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
; CHECK-NEXT: ret <2 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1107,7 +1212,8 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v2i8(
; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i8(<2 x i8> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1118,7 +1224,12 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <3 x i8> @load_v3i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <3 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v3i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_PART_0]] to <2 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i8> [[RET_PART_0_FROM_LOADABLE]], <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i8> poison, <3 x i8> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_PART_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <3 x i8> [[RET_PARTS_0]], i8 [[RET_PART_2]], i64 2
; CHECK-NEXT: ret <3 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1129,7 +1240,11 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v3i8(
; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i8(<3 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i8> [[DATA]], <3 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_0]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i8> [[DATA]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1140,7 +1255,8 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <4 x i8> @load_v4i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
; CHECK-NEXT: ret <4 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1151,7 +1267,8 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v4i8(
; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
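; Note: multi-element i8 vectors are not legal buffer operands, so byte
; vectors up to 4 bytes are packed into the same-sized scalar
; (<2 x i8> <-> i16, <4 x i8> <-> i32); <3 x i8> additionally peels off an
; i8 tail at offset 2, as shown above.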
@@ -1162,7 +1279,12 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <5 x i8> @load_v5i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <5 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v5i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_PART_4]], i64 4
; CHECK-NEXT: ret <5 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1173,7 +1295,11 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v5i8(
; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5i8(<5 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1184,7 +1310,12 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <6 x i8> @load_v6i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
; CHECK-NEXT: ret <6 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1195,7 +1326,11 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v6i8(
; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
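; Note: <6 x i8> combines the two previous tricks: the payload is first
; bitcast to <3 x i16> and then split the same way as the v3i16 case, into
; a <2 x i16> piece plus an i16 tail.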
@@ -1206,7 +1341,16 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <7 x i8> @load_v7i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <7 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v7i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <7 x i8> poison, <7 x i8> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_4_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_PART_4]] to <2 x i8>
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i8> [[RET_PART_4_FROM_LOADABLE]], <2 x i8> poison, <7 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <7 x i8> [[RET_PARTS_0]], <7 x i8> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 6>
+; CHECK-NEXT: [[RET_PART_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = insertelement <7 x i8> [[RET_PARTS_4]], i8 [[RET_PART_6]], i64 6
; CHECK-NEXT: ret <7 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1217,7 +1361,14 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v7i8(
; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7i8(<7 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[DATA_SLICE_4_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_4]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_6:%.*]] = extractelement <7 x i8> [[DATA]], i64 6
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1228,7 +1379,8 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <8 x i8> @load_v8i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v8i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <8 x i8>
; CHECK-NEXT: ret <8 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1239,7 +1391,8 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v8i8(
; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i8(<8 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <8 x i8> [[DATA]] to <2 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1250,7 +1403,8 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <12 x i8> @load_v12i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <12 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v12i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <12 x i8>
; CHECK-NEXT: ret <12 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1261,7 +1415,8 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v12i8(
; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v12i8(<12 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <12 x i8> [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1272,7 +1427,8 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <16 x i8> @load_v16i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <16 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v16i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <4 x i32> [[RET_LOADABLE]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1283,7 +1439,8 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v16i8(
; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v16i8(<16 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <16 x i8> [[DATA]] to <4 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1294,7 +1451,13 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <32 x i8> @load_v32i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <32 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v32i8(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_PART_4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to <32 x i8>
; CHECK-NEXT: ret <32 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1305,7 +1468,11 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v32i8(
; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v32i8(<32 x i8> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i8> [[DATA]] to <8 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
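; Note: for <32 x i8> the value is first bitcast to <8 x i32> and then split
; into two 16-byte v4i32 pieces, so the byte-level view only exists at the
; edges of the lowered sequence.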
@@ -1318,7 +1485,10 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define [1 x i32] @load_a1i32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [1 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <1 x i32> [[RET_FROM_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [1 x i32] poison, i32 [[RET_ELEM_0]], 0
; CHECK-NEXT: ret [1 x i32] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1329,7 +1499,10 @@ define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_a1i32(
; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a1i32([1 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [1 x i32] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <1 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA_AS_VEC_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
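; Note: aggregates are not legal buffer operands, so arrays of scalars are
; rebuilt element by element (extractvalue/insertvalue paired with vector
; extracts/inserts) as the equivalent vector, which then follows the normal
; vector legalization path shown earlier.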
@@ -1340,7 +1513,11 @@ define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
; CHECK-LABEL: define [2 x i32] @load_a2i32(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [2 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET_ELEM_0]], 0
+; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x i32] [[RET_AS_ARRAY_0]], i32 [[RET_ELEM_1]], 1
; CHECK-NEXT: ret [2 x i32] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1351,7 +1528,11 @@ define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_a2i32(
; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2i32([2 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA]], 1
+; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA_AS_VEC_0]], i32 [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1362,7 +1543,11 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
; CHECK-LABEL: define [2 x half] @load_a2f16(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [2 x half] @llvm.amdgcn.raw.ptr.buffer.load.a2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0
+; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x half] [[RET_AS_ARRAY_0]], half [[RET_ELEM_1]], 1
; CHECK-NEXT: ret [2 x half] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1373,7 +1558,11 @@ define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_a2f16(
; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2f16([2 x half] [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [2 x half] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1
+; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x half> [[DATA_AS_VEC_0]], half [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA_AS_VEC_1]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1384,7 +1573,11 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
; CHECK-LABEL: define [2 x ptr addrspace(1)] @load_a2p1(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call [2 x ptr addrspace(1)] @llvm.amdgcn.raw.ptr.buffer.load.a2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_ELEM_0:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT: [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x ptr addrspace(1)] poison, ptr addrspace(1) [[RET_ELEM_0]], 0
+; CHECK-NEXT: [[RET_ELEM_1:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT: [[RET:%.*]] = insertvalue [2 x ptr addrspace(1)] [[RET_AS_ARRAY_0]], ptr addrspace(1) [[RET_ELEM_1]], 1
; CHECK-NEXT: ret [2 x ptr addrspace(1)] [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1395,7 +1588,11 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_a2p1(
; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2p1([2 x ptr addrspace(1)] [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_ELEM_0:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 0
+; CHECK-NEXT: [[DATA_AS_VEC_0:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT: [[DATA_ELEM_1:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 1
+; CHECK-NEXT: [[DATA_AS_VEC_1:%.*]] = insertelement <2 x ptr addrspace(1)> [[DATA_AS_VEC_0]], ptr addrspace(1) [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_AS_VEC_1]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1408,7 +1605,13 @@ define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
define i40 @load_i40(ptr addrspace(8) %buf) {
; CHECK-LABEL: define i40 @load_i40(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i40 @llvm.amdgcn.raw.ptr.buffer.load.i40(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_PART_4]], i64 4
+; CHECK-NEXT: [[RET:%.*]] = bitcast <5 x i8> [[RET_SLICE_4]] to i40
; CHECK-NEXT: ret i40 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1419,7 +1622,12 @@ define i40 @load_i40(ptr addrspace(8) %buf) {
define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_i40(
; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i40(i40 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i40 [[DATA]] to <5 x i8>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA_LEGAL]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
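; Note: i40 is five bytes, so it is reinterpreted as <5 x i8> and then split
; like any odd-length byte vector: a 4-byte piece stored as i32 plus an i8
; tail at offset 4.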
@@ -1430,7 +1638,8 @@ define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
define i96 @load_i96(ptr addrspace(8) %buf) {
; CHECK-LABEL: define i96 @load_i96(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i96 @llvm.amdgcn.raw.ptr.buffer.load.i96(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to i96
; CHECK-NEXT: ret i96 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1441,7 +1650,8 @@ define i96 @load_i96(ptr addrspace(8) %buf) {
define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_i96(
; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i96(i96 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i96 [[DATA]] to <3 x i32>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1452,7 +1662,12 @@ define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
define i160 @load_i160(ptr addrspace(8) %buf) {
; CHECK-LABEL: define i160 @load_i160(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i160 @llvm.amdgcn.raw.ptr.buffer.load.i160(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <5 x i32> poison, <5 x i32> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_SLICE_4:%.*]] = insertelement <5 x i32> [[RET_PARTS_0]], i32 [[RET_PART_4]], i64 4
+; CHECK-NEXT: [[RET:%.*]] = bitcast <5 x i32> [[RET_SLICE_4]] to i160
; CHECK-NEXT: ret i160 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1463,7 +1678,11 @@ define i160 @load_i160(ptr addrspace(8) %buf) {
define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_i160(
; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i160 [[DATA]] to <5 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <5 x i32> [[DATA_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = extractelement <5 x i32> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1474,7 +1693,13 @@ define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
define i256 @load_i256(ptr addrspace(8) %buf) {
; CHECK-LABEL: define i256 @load_i256(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i256 @llvm.amdgcn.raw.ptr.buffer.load.i256(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_PART_4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to i256
; CHECK-NEXT: ret i256 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1485,7 +1710,11 @@ define i256 @load_i256(ptr addrspace(8) %buf) {
define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_i256(
; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i256(i256 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast i256 [[DATA]] to <8 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1498,7 +1727,8 @@ define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
define i7 @load_i4(ptr addrspace(8) %buf) {
; CHECK-LABEL: define i7 @load_i4(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call i7 @llvm.amdgcn.raw.ptr.buffer.load.i7(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i7
; CHECK-NEXT: ret i7 [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1509,7 +1739,8 @@ define i7 @load_i4(ptr addrspace(8) %buf) {
define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_i4(
; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i7(i7 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_ZEXT:%.*]] = zext i7 [[DATA]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1522,7 +1753,8 @@ define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <2 x i4> @load_v2i4(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v2i4(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <2 x i4>
; CHECK-NEXT: ret <2 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1533,7 +1765,8 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v2i4(
; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i4(<2 x i4> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <2 x i4> [[DATA]] to i8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1544,7 +1777,8 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <4 x i4> @load_v4i4(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v4i4(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <4 x i4>
; CHECK-NEXT: ret <4 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1555,7 +1789,8 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v4i4(
; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i4(<4 x i4> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i4> [[DATA]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1566,7 +1801,8 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <8 x i4> @load_v8i4(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <8 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v8i4(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <8 x i4>
; CHECK-NEXT: ret <8 x i4> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1577,7 +1813,8 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v8i4(
; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i4(<8 x i4> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <8 x i4> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1590,7 +1827,9 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <2 x i6> @load_v2i6(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <2 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v2i6(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_TRUNC:%.*]] = trunc i16 [[RET_LOADABLE]] to i12
+; CHECK-NEXT: [[RET:%.*]] = bitcast i12 [[RET_TRUNC]] to <2 x i6>
; CHECK-NEXT: ret <2 x i6> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1601,7 +1840,9 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v2i6(
; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i6(<2 x i6> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_AS_SCALAR:%.*]] = bitcast <2 x i6> [[DATA]] to i12
+; CHECK-NEXT: [[DATA_ZEXT:%.*]] = zext i12 [[DATA_AS_SCALAR]] to i16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_ZEXT]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1613,7 +1854,13 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <6 x i32> @load_v32i6(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <32 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v32i6(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT: [[RET_PART_4:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT: [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_PART_4]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_4:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT: [[RET:%.*]] = bitcast <6 x i32> [[RET_PARTS_4]] to <32 x i6>
; CHECK-NEXT: [[RET_CAST:%.*]] = bitcast <32 x i6> [[RET]] to <6 x i32>
; CHECK-NEXT: ret <6 x i32> [[RET_CAST]]
;
@@ -1627,7 +1874,11 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @store_v32i6(
; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v32i6(<32 x i6> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
%data = bitcast <6 x i32> %data.abi to <32 x i6>
@@ -1641,7 +1892,8 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <4 x i8> @volatile_load_v4i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
; CHECK-NEXT: ret <4 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1652,7 +1904,8 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @volatile_store_v4i8(
; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1663,7 +1916,12 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
; CHECK-LABEL: define <6 x i8> @volatile_load_v6i8(
; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT: [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
+; CHECK-NEXT: [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
; CHECK-NEXT: ret <6 x i8> [[RET]]
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1674,7 +1932,11 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
; CHECK-LABEL: define void @volatile_store_v6i8(
; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT: [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index 90fc3cf3d72ea3..4b47380e7cf145 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -54,7 +54,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i160 [[BUF_PTR_4_PTR_INT_RSRC]], 32, !dbg [[DBG33]]
; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_OFF:%.*]] = zext i32 [[BUF_PTR_4_PTR_OFF]] to i160, !dbg [[DBG33]]
; CHECK-NEXT: [[BUF_PTR_4_PTR_INT:%.*]] = or i160 [[TMP10]], [[BUF_PTR_4_PTR_INT_OFF]], !dbg [[DBG33]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[BUF_PTR_4_PTR_INT]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_CAST:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_CAST_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_CAST]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_CAST_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT: [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_CAST_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_CAST]], i64 4, !dbg [[DBG33]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_CAST_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
; CHECK-NEXT: ret float [[RET]], !dbg [[DBG34:![0-9]+]]
;
%buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20