[llvm] Reapply "[AMDGPU] Handle natively unsupported types in addrspace(7) lowering" (PR #123660)

Krzysztof Drewniak via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 20 10:28:50 PST 2025


https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/123660

(#123657)

This reverts commit 64749fb01538fba2b56d9850497d5f3a626cabc2.

Adds a constructor to VecSlice to address the failure

>From cc55420ce7b29f9f8e7c2d02f300bd4d71f59421 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 20 Jan 2025 18:26:32 +0000
Subject: [PATCH] Reapply "[AMDGPU] Handle natively unsupported types in
 addrspace(7) lowering" (#123657)

This reverts commit 64749fb01538fba2b56d9850497d5f3a626cabc2.

Adds a constructor to VecSlice to address the failure
---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |  567 ++-
 ...ffer-fat-pointers-contents-legalization.ll | 3998 +++++++++++++++++
 ...mdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll |   11 +
 .../AMDGPU/lower-buffer-fat-pointers-calls.ll |    7 +-
 ...ffer-fat-pointers-contents-legalization.ll | 1298 ++++--
 ...fer-fat-pointers-unoptimized-debug-data.ll |    7 +-
 6 files changed, 5497 insertions(+), 391 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 657a406e9f7056..ccb874e6a934e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -66,6 +66,28 @@
 // Atomics operations on `ptr addrspace(7)` values are not suppported, as the
 // hardware does not include a 160-bit atomic.
 //
+// ## Buffer contents type legalization
+//
+// The underlying buffer intrinsics only support types up to 128 bits long,
+// and don't support complex types. If buffer operations were
+// standard pointer operations that could be represented as MIR-level loads,
+// this would be handled by the various legalization schemes in instruction
+// selection. However, because we have to do the conversion from `load` and
+// `store` to intrinsics at LLVM IR level, we must perform that legalization
+// ourselves.
+//
+// This involves a combination of
+// - Converting arrays to vectors where possible
+// - Otherwise, splitting loads and stores of aggregates into loads/stores of
+//   each component.
+// - Zero-extending things to fill a whole number of bytes
+// - Casting values of types that don't neatly correspond to supported machine
+//   values
+//   (for example, an i96 or i256) into ones that would work
+//   (like <3 x i32> and <8 x i32>, respectively)
+// - Splitting values that are too long (such as aforementioned <8 x i32>) into
+//   multiple operations.
+//
 // ## Type remapping
 //
 // We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
@@ -86,7 +108,6 @@
 // This phase also records intrinsics so that they can be remangled or deleted
 // later.
 //
-//
 // ## Splitting pointer structs
 //
 // The meat of this pass consists of defining semantics for operations that
@@ -218,6 +239,7 @@
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -551,7 +573,6 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
   auto *NLI = cast<LoadInst>(LI.clone());
   NLI->mutateType(IntTy);
   NLI = IRB.Insert(NLI);
-  copyMetadataForLoad(*NLI, LI);
   NLI->takeName(&LI);
 
   Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
@@ -576,6 +597,542 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
   return true;
 }
 
+namespace {
+/// Convert loads/stores of types that the buffer intrinsics can't handle into
+/// one or more such loads/stores that consist of legal types.
+///
+/// Do this by
+/// 1. Recursing into structs (and arrays that don't share a memory layout with
+/// vectors) since the intrinsics can't handle complex types.
+/// 2. Converting arrays of non-aggregate, byte-sized types into their
+/// corresponding vectors
+/// 3. Bitcasting unsupported types, namely overly-long scalars and byte
+/// vectors, into vectors of supported types.
+/// 4. Splitting up excessively long reads/writes into multiple operations.
+///
+/// Note that this doesn't handle complex data structures, but, in the future,
+/// the aggregate load splitter from SROA could be refactored to allow for that
+/// case.
+class LegalizeBufferContentTypesVisitor
+    : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
+  friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
+
+  IRBuilder<> IRB;
+
+  const DataLayout &DL;
+
+  /// If T is [N x U], where U is a scalar type, return the vector type
+  /// <N x U>, otherwise, return T.
+  Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
+  Value *arrayToVector(Value *V, Type *TargetType, const Twine &Name);
+  Value *vectorToArray(Value *V, Type *OrigType, const Twine &Name);
+
+  /// Break up the loads of a struct into the loads of its components
+
+  /// Convert a vector or scalar type that can't be operated on by buffer
+  /// intrinsics to one that would be legal through bitcasts and/or truncation.
+  /// Uses the wider of i32, i16, or i8 where possible.
+  Type *legalNonAggregateFor(Type *T);
+  Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
+  Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
+
+  struct VecSlice {
+    uint64_t Index = 0;
+    uint64_t Length = 0;
+    VecSlice() = delete;
+    // Needed for some Clangs
+    VecSlice(uint64_t Index, uint64_t Length) : Index(Index), Length(Length) {}
+  };
+  /// Return the [index, length] pairs into which `T` needs to be cut to form
+  /// legal buffer load or store operations. Clears `Slices`. Creates an empty
+  /// `Slices` for non-vector inputs and creates one slice if no slicing will be
+  /// needed.
+  void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
+
+  Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
+  Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
+
+  /// In most cases, return `LegalType`. However, when given an input that would
+  /// normally be a legal type for the buffer intrinsics to return but that
+  /// isn't hooked up through SelectionDAG, return a type of the same width that
+  /// can be used with the relevant intrinsics. Specifically, handle the cases:
+  /// - <1 x T> => T for all T
+  /// - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
+  /// - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x
+  /// i32>
+  Type *intrinsicTypeFor(Type *LegalType);
+
+  bool visitLoadImpl(LoadInst &OrigLI, Type *PartType,
+                     SmallVectorImpl<uint32_t> &AggIdxs, uint64_t AggByteOffset,
+                     Value *&Result, const Twine &Name);
+  /// Return value is (Changed, ModifiedInPlace)
+  std::pair<bool, bool> visitStoreImpl(StoreInst &OrigSI, Type *PartType,
+                                       SmallVectorImpl<uint32_t> &AggIdxs,
+                                       uint64_t AggByteOffset,
+                                       const Twine &Name);
+
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitLoadInst(LoadInst &LI);
+  bool visitStoreInst(StoreInst &SI);
+
+public:
+  LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
+      : IRB(Ctx), DL(DL) {}
+  bool processFunction(Function &F);
+};
+} // namespace
+
+Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
+  ArrayType *AT = dyn_cast<ArrayType>(T);
+  if (!AT)
+    return T;
+  Type *ET = AT->getElementType();
+  if (!ET->isSingleValueType() || isa<VectorType>(ET))
+    report_fatal_error("loading non-scalar arrays from buffer fat pointers "
+                       "should have recursed");
+  if (!DL.typeSizeEqualsStoreSize(AT))
+    report_fatal_error(
+        "loading padded arrays from buffer fat pointers should have recursed");
+  return FixedVectorType::get(ET, AT->getNumElements());
+}
+
+Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
+                                                        Type *TargetType,
+                                                        const Twine &Name) {
+  Value *VectorRes = PoisonValue::get(TargetType);
+  auto *VT = cast<FixedVectorType>(TargetType);
+  unsigned EC = VT->getNumElements();
+  for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+    Value *Elem = IRB.CreateExtractValue(V, I, Name + ".elem." + Twine(I));
+    VectorRes = IRB.CreateInsertElement(VectorRes, Elem, I,
+                                        Name + ".as.vec." + Twine(I));
+  }
+  return VectorRes;
+}
+
+Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
+                                                        Type *OrigType,
+                                                        const Twine &Name) {
+  Value *ArrayRes = PoisonValue::get(OrigType);
+  ArrayType *AT = cast<ArrayType>(OrigType);
+  unsigned EC = AT->getNumElements();
+  for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+    Value *Elem = IRB.CreateExtractElement(V, I, Name + ".elem." + Twine(I));
+    ArrayRes = IRB.CreateInsertValue(ArrayRes, Elem, I,
+                                     Name + ".as.array." + Twine(I));
+  }
+  return ArrayRes;
+}
+
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
+  TypeSize Size = DL.getTypeStoreSizeInBits(T);
+  // Implicitly zero-extend to the next byte if needed
+  if (!DL.typeSizeEqualsStoreSize(T))
+    T = IRB.getIntNTy(Size.getFixedValue());
+  Type *ElemTy = T->getScalarType();
+  if (isa<PointerType, ScalableVectorType>(ElemTy)) {
+    // Pointers are always big enough, and we'll let scalable vectors through to
+    // fail in codegen.
+    return T;
+  }
+  unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
+  if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
+    // [vectors of] anything that's 16/32/64/128 bits can be cast and split into
+    // legal buffer operations.
+    return T;
+  }
+  Type *BestVectorElemType = nullptr;
+  if (Size.isKnownMultipleOf(32))
+    BestVectorElemType = IRB.getInt32Ty();
+  else if (Size.isKnownMultipleOf(16))
+    BestVectorElemType = IRB.getInt16Ty();
+  else
+    BestVectorElemType = IRB.getInt8Ty();
+  unsigned NumCastElems =
+      Size.getFixedValue() / BestVectorElemType->getIntegerBitWidth();
+  if (NumCastElems == 1)
+    return BestVectorElemType;
+  return FixedVectorType::get(BestVectorElemType, NumCastElems);
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
+    Value *V, Type *TargetType, const Twine &Name) {
+  Type *SourceType = V->getType();
+  TypeSize SourceSize = DL.getTypeSizeInBits(SourceType);
+  TypeSize TargetSize = DL.getTypeSizeInBits(TargetType);
+  if (SourceSize != TargetSize) {
+    Type *ShortScalarTy = IRB.getIntNTy(SourceSize.getFixedValue());
+    Type *ByteScalarTy = IRB.getIntNTy(TargetSize.getFixedValue());
+    Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar");
+    Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext");
+    V = Zext;
+    SourceType = ByteScalarTy;
+  }
+  return IRB.CreateBitCast(V, TargetType, Name + ".legal");
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
+    Value *V, Type *OrigType, const Twine &Name) {
+  Type *LegalType = V->getType();
+  TypeSize LegalSize = DL.getTypeSizeInBits(LegalType);
+  TypeSize OrigSize = DL.getTypeSizeInBits(OrigType);
+  if (LegalSize != OrigSize) {
+    Type *ShortScalarTy = IRB.getIntNTy(OrigSize.getFixedValue());
+    Type *ByteScalarTy = IRB.getIntNTy(LegalSize.getFixedValue());
+    Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast");
+    Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc");
+    return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
+  }
+  return IRB.CreateBitCast(V, OrigType, Name + ".real.ty");
+}
+
+Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
+  auto *VT = dyn_cast<FixedVectorType>(LegalType);
+  if (!VT)
+    return LegalType;
+  Type *ET = VT->getElementType();
+  // Explicitly return the element type of 1-element vectors because the
+  // underlying intrinsics don't like <1 x T> even though it's a synonym for T.
+  if (VT->getNumElements() == 1)
+    return ET;
+  if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32)
+    return FixedVectorType::get(IRB.getInt32Ty(), 3);
+  if (ET->isIntegerTy(8)) {
+    switch (VT->getNumElements()) {
+    default:
+      return LegalType; // Let it crash later
+    case 1:
+      return IRB.getInt8Ty();
+    case 2:
+      return IRB.getInt16Ty();
+    case 4:
+      return IRB.getInt32Ty();
+    case 8:
+      return FixedVectorType::get(IRB.getInt32Ty(), 2);
+    case 16:
+      return FixedVectorType::get(IRB.getInt32Ty(), 4);
+    }
+  }
+  return LegalType;
+}
+
+void LegalizeBufferContentTypesVisitor::getVecSlices(
+    Type *T, SmallVectorImpl<VecSlice> &Slices) {
+  Slices.clear();
+  auto *VT = dyn_cast<FixedVectorType>(T);
+  if (!VT)
+    return;
+
+  uint64_t ElemBitWidth =
+      DL.getTypeSizeInBits(VT->getElementType()).getFixedValue();
+
+  uint64_t ElemsPer4Words = 128 / ElemBitWidth;
+  uint64_t ElemsPer2Words = ElemsPer4Words / 2;
+  uint64_t ElemsPerWord = ElemsPer2Words / 2;
+  uint64_t ElemsPerShort = ElemsPerWord / 2;
+  uint64_t ElemsPerByte = ElemsPerShort / 2;
+  // If the elements evenly pack into 32-bit words, we can use 3-word stores,
+  // such as for <6 x bfloat> or <3 x i32>, but we can't do this for, for
+  // example, <3 x i64>, since that's not slicing.
+  uint64_t ElemsPer3Words = ElemsPerWord * 3;
+
+  uint64_t TotalElems = VT->getNumElements();
+  uint64_t Index = 0;
+  auto TrySlice = [&](unsigned MaybeLen) {
+    if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
+      VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen};
+      Slices.push_back(Slice);
+      Index += MaybeLen;
+      return true;
+    }
+    return false;
+  };
+  while (Index < TotalElems) {
+    TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
+        TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
+        TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
+  }
+}
+
+Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S,
+                                                       const Twine &Name) {
+  auto *VecVT = dyn_cast<FixedVectorType>(Vec->getType());
+  if (!VecVT)
+    return Vec;
+  if (S.Length == VecVT->getNumElements() && S.Index == 0)
+    return Vec;
+  if (S.Length == 1)
+    return IRB.CreateExtractElement(Vec, S.Index,
+                                    Name + ".slice." + Twine(S.Index));
+  SmallVector<int> Mask = llvm::to_vector(
+      llvm::iota_range<int>(S.Index, S.Index + S.Length, /*Inclusive=*/false));
+  return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Index));
+}
+
+Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
+                                                      VecSlice S,
+                                                      const Twine &Name) {
+  auto *WholeVT = dyn_cast<FixedVectorType>(Whole->getType());
+  if (!WholeVT)
+    return Part;
+  if (S.Length == WholeVT->getNumElements() && S.Index == 0)
+    return Part;
+  if (S.Length == 1) {
+    return IRB.CreateInsertElement(Whole, Part, S.Index,
+                                   Name + ".slice." + Twine(S.Index));
+  }
+  int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements();
+
+  // Extend the slice with poisons to make the main shufflevector happy.
+  SmallVector<int> ExtPartMask(NumElems, -1);
+  for (auto [I, E] : llvm::enumerate(
+           MutableArrayRef<int>(ExtPartMask).take_front(S.Length))) {
+    E = I;
+  }
+  Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask,
+                                           Name + ".ext." + Twine(S.Index));
+
+  SmallVector<int> Mask =
+      llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false));
+  for (auto [I, E] :
+       llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Index, S.Length)))
+    E = I + NumElems;
+  return IRB.CreateShuffleVector(Whole, ExtPart, Mask,
+                                 Name + ".parts." + Twine(S.Index));
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
+    LoadInst &OrigLI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
+    uint64_t AggByteOff, Value *&Result, const Twine &Name) {
+  if (auto *ST = dyn_cast<StructType>(PartType)) {
+    const StructLayout *Layout = DL.getStructLayout(ST);
+    bool Changed = false;
+    for (auto [I, ElemTy, Offset] :
+         llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
+      AggIdxs.push_back(I);
+      Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+                               AggByteOff + Offset.getFixedValue(), Result,
+                               Name + "." + Twine(I));
+      AggIdxs.pop_back();
+    }
+    return Changed;
+  }
+  if (auto *AT = dyn_cast<ArrayType>(PartType)) {
+    Type *ElemTy = AT->getElementType();
+    if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
+        ElemTy->isVectorTy()) {
+      TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
+      bool Changed = false;
+      for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
+                                               /*Inclusive=*/false)) {
+        AggIdxs.push_back(I);
+        Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+                                 AggByteOff + I * ElemStoreSize.getFixedValue(),
+                                 Result, Name + Twine(I));
+        AggIdxs.pop_back();
+      }
+      return Changed;
+    }
+  }
+
+  // Typical case
+
+  Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
+  Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+
+  SmallVector<VecSlice> Slices;
+  getVecSlices(LegalType, Slices);
+  bool HasSlices = Slices.size() > 1;
+  bool IsAggPart = !AggIdxs.empty();
+  Value *LoadsRes;
+  if (!HasSlices && !IsAggPart) {
+    Type *LoadableType = intrinsicTypeFor(LegalType);
+    if (LoadableType == PartType)
+      return false;
+
+    IRB.SetInsertPoint(&OrigLI);
+    auto *NLI = cast<LoadInst>(OrigLI.clone());
+    NLI->mutateType(LoadableType);
+    NLI = IRB.Insert(NLI);
+    NLI->setName(Name + ".loadable");
+
+    LoadsRes = IRB.CreateBitCast(NLI, LegalType, Name + ".from.loadable");
+  } else {
+    IRB.SetInsertPoint(&OrigLI);
+    LoadsRes = PoisonValue::get(LegalType);
+    Value *OrigPtr = OrigLI.getPointerOperand();
+    // If we're needing to spill something into more than one load, its legal
+    // type will be a vector (ex. an i256 load will have LegalType = <8 x i32>).
+    // But if we're already a scalar (which can happen if we're splitting up a
+    // struct), the element type will be the legal type itself.
+    Type *ElemType = LegalType->getScalarType();
+    unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+    AAMDNodes AANodes = OrigLI.getAAMetadata();
+    if (IsAggPart && Slices.empty())
+      Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
+    for (VecSlice S : Slices) {
+      Type *SliceType =
+          S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+      int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
+      // You can't reasonably expect loads to wrap around the edge of memory.
+      Value *NewPtr = IRB.CreateGEP(
+          IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset),
+          OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset),
+          GEPNoWrapFlags::noUnsignedWrap());
+      Type *LoadableType = intrinsicTypeFor(SliceType);
+      LoadInst *NewLI = IRB.CreateAlignedLoad(
+          LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
+          Name + ".off." + Twine(ByteOffset));
+      copyMetadataForLoad(*NewLI, OrigLI);
+      NewLI->setAAMetadata(
+          AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
+      NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
+      NewLI->setVolatile(OrigLI.isVolatile());
+      Value *Loaded = IRB.CreateBitCast(NewLI, SliceType,
+                                        NewLI->getName() + ".from.loadable");
+      LoadsRes = insertSlice(LoadsRes, Loaded, S, Name);
+    }
+  }
+  if (LegalType != ArrayAsVecType)
+    LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name);
+  if (ArrayAsVecType != PartType)
+    LoadsRes = vectorToArray(LoadsRes, PartType, Name);
+
+  if (IsAggPart)
+    Result = IRB.CreateInsertValue(Result, LoadsRes, AggIdxs, Name);
+  else
+    Result = LoadsRes;
+  return true;
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) {
+  if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+
+  SmallVector<uint32_t> AggIdxs;
+  Type *OrigType = LI.getType();
+  Value *Result = PoisonValue::get(OrigType);
+  bool Changed = visitLoadImpl(LI, OrigType, AggIdxs, 0, Result, LI.getName());
+  if (!Changed)
+    return false;
+  Result->takeName(&LI);
+  LI.replaceAllUsesWith(Result);
+  LI.eraseFromParent();
+  return Changed;
+}
+
+std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
+    StoreInst &OrigSI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
+    uint64_t AggByteOff, const Twine &Name) {
+  if (auto *ST = dyn_cast<StructType>(PartType)) {
+    const StructLayout *Layout = DL.getStructLayout(ST);
+    bool Changed = false;
+    for (auto [I, ElemTy, Offset] :
+         llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
+      AggIdxs.push_back(I);
+      Changed |= std::get<0>(visitStoreImpl(OrigSI, ElemTy, AggIdxs,
+                                            AggByteOff + Offset.getFixedValue(),
+                                            Name + "." + Twine(I)));
+      AggIdxs.pop_back();
+    }
+    return std::make_pair(Changed, /*ModifiedInPlace=*/false);
+  }
+  if (auto *AT = dyn_cast<ArrayType>(PartType)) {
+    Type *ElemTy = AT->getElementType();
+    if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
+        ElemTy->isVectorTy()) {
+      TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
+      bool Changed = false;
+      for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
+                                               /*Inclusive=*/false)) {
+        AggIdxs.push_back(I);
+        Changed |= std::get<0>(visitStoreImpl(
+            OrigSI, ElemTy, AggIdxs,
+            AggByteOff + I * ElemStoreSize.getFixedValue(), Name + Twine(I)));
+        AggIdxs.pop_back();
+      }
+      return std::make_pair(Changed, /*ModifiedInPlace=*/false);
+    }
+  }
+
+  Value *OrigData = OrigSI.getValueOperand();
+  Value *NewData = OrigData;
+
+  bool IsAggPart = !AggIdxs.empty();
+  if (IsAggPart)
+    NewData = IRB.CreateExtractValue(NewData, AggIdxs, Name);
+
+  Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
+  if (ArrayAsVecType != PartType) {
+    NewData = arrayToVector(NewData, ArrayAsVecType, Name);
+  }
+
+  Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+  if (LegalType != ArrayAsVecType) {
+    NewData = makeLegalNonAggregate(NewData, LegalType, Name);
+  }
+
+  SmallVector<VecSlice> Slices;
+  getVecSlices(LegalType, Slices);
+  bool NeedToSplit = Slices.size() > 1 || IsAggPart;
+  if (!NeedToSplit) {
+    Type *StorableType = intrinsicTypeFor(LegalType);
+    if (StorableType == PartType)
+      return std::make_pair(/*Changed=*/false, /*ModifiedInPlace=*/false);
+    NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable");
+    OrigSI.setOperand(0, NewData);
+    return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/true);
+  }
+
+  Value *OrigPtr = OrigSI.getPointerOperand();
+  Type *ElemType = LegalType->getScalarType();
+  if (IsAggPart && Slices.empty())
+    Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
+  unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+  AAMDNodes AANodes = OrigSI.getAAMetadata();
+  for (VecSlice S : Slices) {
+    Type *SliceType =
+        S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+    int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
+    Value *NewPtr =
+        IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
+                      OrigPtr->getName() + ".part." + Twine(S.Index),
+                      GEPNoWrapFlags::noUnsignedWrap());
+    Value *DataSlice = extractSlice(NewData, S, Name);
+    Type *StorableType = intrinsicTypeFor(SliceType);
+    DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
+                                  DataSlice->getName() + ".storable");
+    auto *NewSI = cast<StoreInst>(OrigSI.clone());
+    NewSI->setAlignment(commonAlignment(OrigSI.getAlign(), ByteOffset));
+    IRB.Insert(NewSI);
+    NewSI->setOperand(0, DataSlice);
+    NewSI->setOperand(1, NewPtr);
+    NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL));
+  }
+  return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/false);
+}
+
+bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
+  if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  IRB.SetInsertPoint(&SI);
+  SmallVector<uint32_t> AggIdxs;
+  Value *OrigData = SI.getValueOperand();
+  auto [Changed, ModifiedInPlace] =
+      visitStoreImpl(SI, OrigData->getType(), AggIdxs, 0, OrigData->getName());
+  if (Changed && !ModifiedInPlace)
+    SI.eraseFromParent();
+  return Changed;
+}
+
+bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
+  bool Changed = false;
+  for (Instruction &I : make_early_inc_range(instructions(F))) {
+    Changed |= visit(I);
+  }
+  return Changed;
+}
+
 /// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered
 /// buffer fat pointer constant.
 static std::pair<Constant *, Constant *>
@@ -1766,12 +2323,16 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
   }
 
   StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+  LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
+                                                              M.getContext());
   for (Function &F : M.functions()) {
     bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
     bool BodyChanges = containsBufferFatPointers(F, &StructTM);
     Changed |= MemOpsRewrite.processFunction(F);
-    if (InterfaceChange || BodyChanges)
+    if (InterfaceChange || BodyChanges) {
       NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
+      Changed |= BufferContentsTypeRewrite.processFunction(F);
+    }
   }
   if (NeedsRemap.empty())
     return Changed;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
new file mode 100644
index 00000000000000..4c7a4ba3a44a5f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -0,0 +1,3998 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
+
+; Note: if you're adding tests here, also add them to
+; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
+; the lowering.
+
+;;; Legal types. These are natively supported, no casts should be performed.
+
+define i8 @load_i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i8, ptr addrspace(7) %p
+  ret i8 %ret
+}
+
+define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i8 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i16 @load_i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i16, ptr addrspace(7) %p
+  ret i16 %ret
+}
+
+define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i16 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i32 @load_i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i32, ptr addrspace(7) %p
+  ret i32 %ret
+}
+
+define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i32 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i64 @load_i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i64, ptr addrspace(7) %p
+  ret i64 %ret
+}
+
+define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i64 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i128 @load_i128(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i128:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i128:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i128, ptr addrspace(7) %p
+  ret i128 %ret
+}
+
+define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i128:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i128:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i128 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <1 x i32>, ptr addrspace(7) %p
+  ret <1 x i32> %ret
+}
+
+define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <1 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i32>, ptr addrspace(7) %p
+  ret <2 x i32> %ret
+}
+
+define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x i32>, ptr addrspace(7) %p
+  ret <3 x i32> %ret
+}
+
+define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x i32>, ptr addrspace(7) %p
+  ret <4 x i32> %ret
+}
+
+define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i16>, ptr addrspace(7) %p
+  ret <2 x i16> %ret
+}
+
+define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x i16>, ptr addrspace(7) %p
+  ret <4 x i16> %ret
+}
+
+define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x i16>, ptr addrspace(7) %p
+  ret <8 x i16> %ret
+}
+
+define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i64>, ptr addrspace(7) %p
+  ret <2 x i64> %ret
+}
+
+define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i64> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define half @load_f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load half, ptr addrspace(7) %p
+  ret half %ret
+}
+
+define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store half %data, ptr addrspace(7) %p
+  ret void
+}
+
+define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load bfloat, ptr addrspace(7) %p
+  ret bfloat %ret
+}
+
+define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store bfloat %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x half>, ptr addrspace(7) %p
+  ret <2 x half> %ret
+}
+
+define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x half> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x bfloat>, ptr addrspace(7) %p
+  ret <4 x bfloat> %ret
+}
+
+define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x bfloat> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x half>, ptr addrspace(7) %p
+  ret <8 x half> %ret
+}
+
+define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x half> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define float @load_f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load float, ptr addrspace(7) %p
+  ret float %ret
+}
+
+define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store float %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x float>, ptr addrspace(7) %p
+  ret <2 x float> %ret
+}
+
+define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x float>, ptr addrspace(7) %p
+  ret <3 x float> %ret
+}
+
+define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x float>, ptr addrspace(7) %p
+  ret <4 x float> %ret
+}
+
+define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(0), ptr addrspace(7) %p
+  ret ptr addrspace(0) %ret
+}
+
+define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(0) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(1), ptr addrspace(7) %p
+  ret ptr addrspace(1) %ret
+}
+
+define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(1) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(2), ptr addrspace(7) %p
+  ret ptr addrspace(2) %ret
+}
+
+define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(2) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(3), ptr addrspace(7) %p
+  ret ptr addrspace(3) %ret
+}
+
+define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(3) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(4), ptr addrspace(7) %p
+  ret ptr addrspace(4) %ret
+}
+
+define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(4) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(5), ptr addrspace(7) %p
+  ret ptr addrspace(5) %ret
+}
+
+define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(5) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(6), ptr addrspace(7) %p
+  ret ptr addrspace(6) %ret
+}
+
+define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(6) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(8), ptr addrspace(7) %p
+  ret ptr addrspace(8) %ret
+}
+
+define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(8) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x ptr addrspace(1)>, ptr addrspace(7) %p
+  ret <2 x ptr addrspace(1)> %ret
+}
+
+define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x ptr addrspace(5)>, ptr addrspace(7) %p
+  ret <2 x ptr addrspace(5)> %ret
+}
+
+define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x ptr addrspace(5)>, ptr addrspace(7) %p
+  ret <3 x ptr addrspace(5)> %ret
+}
+
+define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x ptr addrspace(5)>, ptr addrspace(7) %p
+  ret <4 x ptr addrspace(5)> %ret
+}
+
+define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Three words (96 bits) of 16-bit elements: <6 x half> must be bitcast to <3 x i32> to be supported.
+
+define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x half>, ptr addrspace(7) %p
+  ret <6 x half> %ret
+}
+
+define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x half> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Long types (32-bit elements, more than 128 bits total). These must be split into multiple operations.
+
+define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <5 x float>, ptr addrspace(7) %p
+  ret <5 x float> %ret
+}
+
+define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <5 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x float>, ptr addrspace(7) %p
+  ret <6 x float> %ret
+}
+
+define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <7 x float>, ptr addrspace(7) %p
+  ret <7 x float> %ret
+}
+
+define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[4:6], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <7 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x float>, ptr addrspace(7) %p
+  ret <8 x float> %ret
+}
+
+define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v10f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v10f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <10 x float>, ptr addrspace(7) %p
+  ret <10 x float> %ret
+}
+
+define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v10f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v10f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx2 v[8:9], off, s[16:19], 0 offset:32
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <10 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x i32>, ptr addrspace(7) %p
+  ret <6 x i32> %ret
+}
+
+define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x ptr addrspace(1)>, ptr addrspace(7) %p
+  ret <4 x ptr addrspace(1)> %ret
+}
+
+define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Uneven types with 16-bit elements. These require splitting into multiple operations.
+
+define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <1 x i16>, ptr addrspace(7) %p
+  ret <1 x i16> %ret
+}
+
+define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <1 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
+; A <3 x i16> (48-bit) load splits into a dword load plus a trailing ushort at offset 4.
+; SDAG-LABEL: load_v3i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x i16>, ptr addrspace(7) %p
+  ret <3 x i16> %ret
+}
+
+define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
+; A <3 x i16> store splits into a dword store plus a trailing short at offset 4.
+; SDAG-LABEL: store_v3i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
+; A <5 x i16> load splits into a dwordx2 load plus a trailing ushort at offset 8.
+; SDAG-LABEL: load_v5i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT:    s_mov_b32 s4, 0xffff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <5 x i16>, ptr addrspace(7) %p
+  ret <5 x i16> %ret
+}
+
+define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
+; A <5 x i16> store splits into a dwordx2 store plus a trailing short at offset 8.
+; SDAG-LABEL: store_v5i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:8
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <5 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
+; A <6 x i16> (96-bit) load fits a natively supported width: one dwordx3 load.
+; SDAG-LABEL: load_v6i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x i16>, ptr addrspace(7) %p
+  ret <6 x i16> %ret
+}
+
+define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
+; A <6 x i16> (96-bit) store fits a natively supported width: one dwordx3 store.
+; SDAG-LABEL: store_v6i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
+; A <7 x i16> load splits into a dwordx3 load plus a trailing ushort at offset 12.
+; SDAG-LABEL: load_v7i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT:    s_mov_b32 s4, 0xffff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT:    v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <7 x i16>, ptr addrspace(7) %p
+  ret <7 x i16> %ret
+}
+
+define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
+; A <7 x i16> store splits into a dwordx3 store plus a trailing short at offset 12.
+; SDAG-LABEL: store_v7i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v3, off, s[16:19], 0 offset:12
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <7 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
+; A <9 x i16> load exceeds 128 bits: a dwordx4 load plus a trailing ushort at offset 16.
+; SDAG-LABEL: load_v9i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v9i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_mov_b32 s4, 0xffff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT:    v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT:    v_bfi_b32 v3, s4, v3, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <9 x i16>, ptr addrspace(7) %p
+  ret <9 x i16> %ret
+}
+
+define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
+; A <9 x i16> store exceeds 128 bits: a dwordx4 store plus a trailing short at offset 16.
+; SDAG-LABEL: store_v9i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v9i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <9 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Byte vectors. These need to be
+;;; - split into multiple operations, and/or
+;;; - bitcast to a wider element type when they have a natively supported width.
+
+define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
+; A <1 x i8> load is treated as a scalar i8: a single 8-bit buffer load.
+; SDAG-LABEL: load_v1i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <1 x i8>, ptr addrspace(7) %p
+  ret <1 x i8> %ret
+}
+
+define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <1 x i8> store is treated as a scalar i8: a single 8-bit buffer store.
+; SDAG-LABEL: store_v1i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <1 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
+; A <2 x i8> load is done as one 16-bit load, then unpacked with a shift.
+; SDAG-LABEL: load_v2i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i8>, ptr addrspace(7) %p
+  ret <2 x i8> %ret
+}
+
+define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <2 x i8> store packs both bytes into 16 bits and issues one short store.
+; SDAG-LABEL: store_v2i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
+; A <3 x i8> load splits into a ushort load (low 2 bytes, then unpacked) plus a ubyte at offset 2.
+; SDAG-LABEL: load_v3i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x i8>, ptr addrspace(7) %p
+  ret <3 x i8> %ret
+}
+
+define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <3 x i8> store packs the low 2 bytes into a short store plus a byte store at offset 2.
+; SDAG-LABEL: store_v3i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_byte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_byte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
+; A <4 x i8> load is bitcast to one dword load, then unpacked with shifts.
+; SDAG-LABEL: load_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x i8>, ptr addrspace(7) %p
+  ret <4 x i8> %ret
+}
+
+define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <4 x i8> store packs all four bytes into a single dword store.
+; SDAG-LABEL: store_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
+; A <5 x i8> load splits into a dword load (unpacked with shifts) plus a ubyte at offset 4.
+; SDAG-LABEL: load_v5i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <5 x i8>, ptr addrspace(7) %p
+  ret <5 x i8> %ret
+}
+
+define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <5 x i8> store packs the low 4 bytes into a dword store plus a byte store at offset 4.
+; SDAG-LABEL: store_v5i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_byte v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v6, 8
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v5, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_byte v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <5 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
+; A <6 x i8> load splits into a dword load plus a ushort at offset 4, both unpacked with shifts.
+; SDAG-LABEL: load_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v6, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v7
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x i8>, ptr addrspace(7) %p
+  ret <6 x i8> %ret
+}
+
+define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <6 x i8> store packs into a dword store plus a short store at offset 4.
+; SDAG-LABEL: store_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
+; A <7 x i8> load splits three ways: dword, ushort at offset 4, ubyte at offset 6.
+; SDAG-LABEL: load_v7i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT:    s_waitcnt vmcnt(2)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    buffer_load_ubyte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <7 x i8>, ptr addrspace(7) %p
+  ret <7 x i8> %ret
+}
+
+define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <7 x i8> store splits three ways: dword, short at offset 4, byte at offset 6.
+; SDAG-LABEL: store_v7i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    v_lshlrev_b16_e32 v0, 8, v5
+; SDAG-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    buffer_store_byte v6, off, s[16:19], 0 offset:6
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v8, 8
+; GISEL-NEXT:    v_mov_b32_e32 v7, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v7, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GISEL-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    buffer_store_byte v6, off, s[16:19], 0 offset:6
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <7 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
+; A <8 x i8> load is bitcast to one dwordx2 load, then unpacked with shifts.
+; SDAG-LABEL: load_v8i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mov_b32_e32 v1, v8
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x i8>, ptr addrspace(7) %p
+  ret <8 x i8> %ret
+}
+
+define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
+; A <8 x i8> store packs all eight bytes and issues one dwordx2 store.
+; SDAG-LABEL: store_v8i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx2 v[3:4], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v9, 8
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v8, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
+; A <12 x i8> load is bitcast to one dwordx3 load, then unpacked with shifts.
+; SDAG-LABEL: load_v12i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mov_b32_e32 v1, v14
+; SDAG-NEXT:    v_mov_b32_e32 v2, v13
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v12i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v12
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <12 x i8>, ptr addrspace(7) %p
+  ret <12 x i8> %ret
+}
+
+define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v12i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx3 v[6:8], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v12i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v13, 8
+; GISEL-NEXT:    v_mov_b32_e32 v12, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v12, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT:    v_and_or_b32 v2, v8, v12, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <12 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v16i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b64 v[18:19], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[2:3]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v17, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; SDAG-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v12, v3
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v16
+; SDAG-NEXT:    v_mov_b32_e32 v3, v18
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v16i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GISEL-NEXT:    v_mov_b32_e32 v12, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v16
+; GISEL-NEXT:    v_mov_b32_e32 v2, v17
+; GISEL-NEXT:    v_mov_b32_e32 v3, v18
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <16 x i8>, ptr addrspace(7) %p
+  ret <16 x i8> %ret
+}
+
+define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v16i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx4 v[9:12], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v16i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v17, 8
+; GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v16, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT:    v_and_or_b32 v2, v8, v16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v3, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT:    v_and_or_b32 v3, v12, v16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <16 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[33:36], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[48:51], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[33:34]
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[35:36]
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b64 v[19:20], 24, v[48:49]
+; SDAG-NEXT:    v_lshrrev_b64 v[27:28], 24, v[50:51]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v33
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v33
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v34
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v34
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v34
+; SDAG-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 16, v35
+; SDAG-NEXT:    v_lshrrev_b32_e32 v13, 8, v36
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 16, v36
+; SDAG-NEXT:    v_lshrrev_b32_e32 v15, 24, v36
+; SDAG-NEXT:    v_lshrrev_b32_e32 v17, 8, v48
+; SDAG-NEXT:    v_lshrrev_b32_e32 v18, 16, v48
+; SDAG-NEXT:    v_lshrrev_b32_e32 v21, 8, v49
+; SDAG-NEXT:    v_lshrrev_b32_e32 v22, 16, v49
+; SDAG-NEXT:    v_lshrrev_b32_e32 v23, 24, v49
+; SDAG-NEXT:    v_lshrrev_b32_e32 v25, 8, v50
+; SDAG-NEXT:    v_lshrrev_b32_e32 v26, 16, v50
+; SDAG-NEXT:    v_lshrrev_b32_e32 v29, 8, v51
+; SDAG-NEXT:    v_lshrrev_b32_e32 v30, 16, v51
+; SDAG-NEXT:    v_lshrrev_b32_e32 v31, 24, v51
+; SDAG-NEXT:    v_mov_b32_e32 v0, v33
+; SDAG-NEXT:    v_mov_b32_e32 v4, v34
+; SDAG-NEXT:    v_mov_b32_e32 v8, v35
+; SDAG-NEXT:    v_mov_b32_e32 v12, v36
+; SDAG-NEXT:    v_mov_b32_e32 v16, v48
+; SDAG-NEXT:    v_mov_b32_e32 v20, v49
+; SDAG-NEXT:    v_mov_b32_e32 v24, v50
+; SDAG-NEXT:    v_mov_b32_e32 v28, v51
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[16:19], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
+; GISEL-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; GISEL-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
+; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
+; GISEL-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
+; GISEL-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
+; GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GISEL-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
+; GISEL-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
+; GISEL-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
+; GISEL-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GISEL-NEXT:    v_mov_b32_e32 v12, v3
+; GISEL-NEXT:    v_mov_b32_e32 v20, v17
+; GISEL-NEXT:    v_mov_b32_e32 v24, v18
+; GISEL-NEXT:    v_mov_b32_e32 v28, v19
+; GISEL-NEXT:    v_mov_b32_e32 v1, v35
+; GISEL-NEXT:    v_mov_b32_e32 v2, v36
+; GISEL-NEXT:    v_mov_b32_e32 v3, v37
+; GISEL-NEXT:    v_mov_b32_e32 v17, v32
+; GISEL-NEXT:    v_mov_b32_e32 v18, v33
+; GISEL-NEXT:    v_mov_b32_e32 v19, v34
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <32 x i8>, ptr addrspace(7) %p
+  ret <32 x i8> %ret
+}
+
+define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v29
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT:    v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v25
+; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v27
+; SDAG-NEXT:    v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v21
+; SDAG-NEXT:    v_lshlrev_b16_e32 v2, 8, v23
+; SDAG-NEXT:    v_lshlrev_b16_e32 v3, 8, v17
+; SDAG-NEXT:    v_lshlrev_b16_e32 v15, 8, v19
+; SDAG-NEXT:    v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v4, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v0, 8, v14
+; SDAG-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx4 v[3:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v31, 8
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_mov_b32_e32 v32, 0xff
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v32, v1
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v31, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GISEL-NEXT:    buffer_load_ubyte v7, off, s[0:3], s32
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v32, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT:    v_or3_b32 v1, v1, v4, v5
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT:    v_and_or_b32 v2, v8, v32, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v3, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT:    v_and_or_b32 v3, v12, v32, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v31, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v18
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v19
+; GISEL-NEXT:    v_and_or_b32 v4, v16, v32, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v8, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v22
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v23
+; GISEL-NEXT:    v_and_or_b32 v8, v20, v32, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT:    v_or3_b32 v5, v8, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v31, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v26
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v27
+; GISEL-NEXT:    v_and_or_b32 v6, v24, v32, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GISEL-NEXT:    v_or3_b32 v6, v6, v8, v9
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GISEL-NEXT:    v_and_or_b32 v8, v28, v32, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GISEL-NEXT:    v_or3_b32 v7, v8, v9, v7
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <32 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Arrays. Need to become vectors.
+
+define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [1 x i32], ptr addrspace(7) %p
+  ret [1 x i32] %ret
+}
+
+define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [1 x i32] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x i32], ptr addrspace(7) %p
+  ret [2 x i32] %ret
+}
+
+define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x i32] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x half], ptr addrspace(7) %p
+  ret [2 x half] %ret
+}
+
+define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, s4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x half] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x ptr addrspace(1)], ptr addrspace(7) %p
+  ret [2 x ptr addrspace(1)] %ret
+}
+
+define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x ptr addrspace(1)] %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Scalars of atypical width. Need to be cast to vectors and split.
+
+define i40 @load_i40(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i40:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i40:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v4
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i40, ptr addrspace(7) %p
+  ret i40 %ret
+}
+
+define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i40:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_byte v1, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i40:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_byte v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i40 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i96 @load_i96(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i96:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i96:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i96, ptr addrspace(7) %p
+  ret i96 %ret
+}
+
+define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i96:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i96:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i96 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i160 @load_i160(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i160:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_mov_b32 s4, s33
+; SDAG-NEXT:    s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT:    s_addk_i32 s32, 0x1800
+; SDAG-NEXT:    s_addk_i32 s32, 0xe800
+; SDAG-NEXT:    s_mov_b32 s33, s4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i160:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i160, ptr addrspace(7) %p
+  ret i160 %ret
+}
+
+define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i160:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, s33
+; SDAG-NEXT:    s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT:    s_addk_i32 s32, 0x1000
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_addk_i32 s32, 0xf000
+; SDAG-NEXT:    s_mov_b32 s33, s4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i160:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i160 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i256 @load_i256(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i256:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i256:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i256, ptr addrspace(7) %p
+  ret i256 %ret
+}
+
+define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i256:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i256:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i256 %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Non-byte-sized scalars. Require zero-extension.
+
+define i7 @load_i7(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i7:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i7:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i7, ptr addrspace(7) %p
+  ret i7 %ret
+}
+
+define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i7:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i7:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i7 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i4 @load_i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i4, ptr addrspace(7) %p
+  ret i4 %ret
+}
+
+define void @store_i4(i4 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i4 %data, ptr addrspace(7) %p
+  ret void
+}
+
+
+;;; Byte-sized vectors of i4. Require casts.
+
+define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT:    v_lshrrev_b16_e32 v1, 4, v1
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i4>, ptr addrspace(7) %p
+  ret <2 x i4> %ret
+}
+
+define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i4> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[0:3], s32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b16_e32 v4, 4, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT:    v_lshrrev_b16_e32 v3, 12, v1
+; SDAG-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x i4>, ptr addrspace(7) %p
+  ret <4 x i4> %ret
+}
+
+define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    v_mov_b32_e32 v1, 15
+; SDAG-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 12, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, 15
+; GISEL-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v3
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x i4> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v7, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v7
+; SDAG-NEXT:    v_bfe_u32 v1, v7, 4, 4
+; SDAG-NEXT:    v_bfe_u32 v2, v7, 8, 4
+; SDAG-NEXT:    v_bfe_u32 v3, v7, 12, 4
+; SDAG-NEXT:    v_bfe_u32 v4, v7, 16, 4
+; SDAG-NEXT:    v_bfe_u32 v5, v7, 20, 4
+; SDAG-NEXT:    v_bfe_u32 v6, v7, 24, 4
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 28, v7
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 20, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 28, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x i4>, ptr addrspace(7) %p
+  ret <8 x i4> %ret
+}
+
+define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; SDAG-NEXT:    v_and_or_b32 v0, v0, 15, v1
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v3
+; SDAG-NEXT:    v_and_b32_e32 v2, 15, v2
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v5
+; SDAG-NEXT:    v_mov_b32_e32 v2, 15
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 20, v1
+; SDAG-NEXT:    v_and_b32_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT:    v_or3_b32 v0, v0, v3, v1
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 28, v7
+; SDAG-NEXT:    v_and_b32_sdwa v2, v6, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GISEL-NEXT:    v_and_or_b32 v0, v0, 15, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 15, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 12, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_mov_b32_e32 v1, 15
+; GISEL-NEXT:    v_and_b32_e32 v3, 15, v5
+; GISEL-NEXT:    v_and_b32_sdwa v2, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
+; GISEL-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT:    v_and_b32_e32 v2, 15, v7
+; GISEL-NEXT:    v_and_b32_sdwa v1, v6, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 28, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x i4> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Vectors of non-byte-sized integers.
+
+define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 63, v1
+; SDAG-NEXT:    v_bfe_u32 v1, v1, 6, 6
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b16_e32 v1, 6, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i6>, ptr addrspace(7) %p
+  ret <2 x i6> %ret
+}
+
+define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 6, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 63, v0
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 63, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 63, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 6, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i6> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Blocks of 6-bit elements (e.g. fp6 data stored as i6), accessed via bitcast to <6 x i32>
+define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <32 x i6>, ptr addrspace(7) %p
+  %ret.cast = bitcast <32 x i6> %ret to <6 x i32>
+  ret <6 x i32> %ret.cast
+}
+
+define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %data = bitcast <6 x i32> %data.abi to <32 x i6>
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <32 x i6> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Modifiers (volatile accesses) and aggregate types (arrays and structs)
+
+define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load volatile <4 x i8>, ptr addrspace(7) %p
+  ret <4 x i8> %ret
+}
+
+define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store volatile <4 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT:    buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v7
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load volatile <6 x i8>, ptr addrspace(7) %p
+  ret <6 x i8> %ret
+}
+
+define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store volatile <6 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x [2 x i32]], ptr addrspace(7) %p
+  ret [2 x [2 x i32]] %ret
+}
+
+define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x [2 x i32]] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x <2 x i32>] @load_a2v2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x <2 x i32>], ptr addrspace(7) %p
+  ret [2 x <2 x i32>] %ret
+}
+
+define void @store_a2v2i32([2 x <2 x i32>] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x <2 x i32>] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i32 } @load_sl_i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i32 }, ptr addrspace(7) %p
+  ret { i32 } %ret
+}
+
+define void @store_sl_i32s({ i32 } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { { float } } @load_sl_sl_f32ss(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_sl_f32ss:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_sl_f32ss:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { { float } }, ptr addrspace(7) %p
+  ret { { float } } %ret
+}
+
+define void @store_sl_sl_f32ss({ { float } } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_sl_f32ss:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_sl_f32ss:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { { float } } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { <2 x i32> } @load_sl_v2i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_v2i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_v2i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { <2 x i32> }, ptr addrspace(7) %p
+  ret { <2 x i32> } %ret
+}
+
+define void @store_sl_v2i32s({ <2 x i32> } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_v2i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_v2i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { <2 x i32> } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i64, i32 } @load_sl_i64i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_i64i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_i64i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i64, i32 }, ptr addrspace(7) %p
+  ret { i64, i32 } %ret
+}
+
+define void @store_sl_i64i32s({ i64, i32 } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_i64i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_i64i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i64, i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [4 x i7] @load_a4i7(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a4i7:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT:    buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a4i7:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT:    buffer_load_ubyte v3, off, s[16:19], 0 offset:3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [4 x i7], ptr addrspace(7) %p
+  ret [4 x i7] %ret
+}
+
+define void @store_a4i7([4 x i7] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a4i7:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v1
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:1
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v2
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:2
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v3
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:3
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a4i7:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v1
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v2
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v3
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [4 x i7] %data, ptr addrspace(7) %p
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
new file mode 100644
index 00000000000000..a91d38a58a1e9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
@@ -0,0 +1,11 @@
+; Note: The exact error messages aren't important here, but are included to catch
+; anything changing.
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900  -filetype=null < %s 2>&1 | FileCheck %s --check-prefix=SDAG
+; SDAG: LLVM ERROR: Scalarization of scalable vectors is not supported.
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900  -filetype=null < %s 2>&1 | FileCheck %s --check-prefix=GISEL
+; GISEL: LLVM ERROR: Invalid size request on a scalable vector.
+
+define void @buffer_store_nxv2i32(ptr addrspace(8) inreg %rsrc, i32 %offset) {
+  call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> poison, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 6f0d51a0277380..022094bc633c88 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -91,7 +91,12 @@ define void @caller(ptr addrspace(7) noundef nonnull %arg) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i160 [[V_INT_RSRC]], 32
 ; CHECK-NEXT:    [[V_INT_OFF:%.*]] = zext i32 [[V_OFF]] to i160
 ; CHECK-NEXT:    [[V_INT:%.*]] = or i160 [[TMP1]], [[V_INT_OFF]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[V_INT]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT:    [[V_INT_LEGAL:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
+; CHECK-NEXT:    [[V_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT:    [[ARG_PART_4:%.*]] = add nuw i32 [[ARG_OFF]], 16
+; CHECK-NEXT:    [[V_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_LEGAL]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %v = call ptr addrspace(7) @extern(ptr addrspace(7) %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index 5b225636b120a4..d18f0f8bd1ff93 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -1,13 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
 
+; Note: if you're adding tests here, also add them to
+; buffer-fat-pointers-contents-legalization.ll to make sure the output of this
+; transformation can codegen.
+
 target triple = "amdgcn--"
 
 ;;; Legal types. These are natively supported, no casts should be performed.
 
-define i8 @load_i8(ptr addrspace(8) %buf) {
+define i8 @load_i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i8 @load_i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
@@ -16,9 +20,9 @@ define i8 @load_i8(ptr addrspace(8) %buf) {
   ret i8 %ret
 }
 
-define void @store_i8(i8 %data, ptr addrspace(8) %buf) {
+define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i8(
-; CHECK-SAME: i8 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -27,9 +31,9 @@ define void @store_i8(i8 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i16 @load_i16(ptr addrspace(8) %buf) {
+define i16 @load_i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i16 @load_i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
@@ -38,9 +42,9 @@ define i16 @load_i16(ptr addrspace(8) %buf) {
   ret i16 %ret
 }
 
-define void @store_i16(i16 %data, ptr addrspace(8) %buf) {
+define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i16(
-; CHECK-SAME: i16 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -49,9 +53,9 @@ define void @store_i16(i16 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i32 @load_i32(ptr addrspace(8) %buf) {
+define i32 @load_i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i32 @load_i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i32 [[RET]]
 ;
@@ -60,9 +64,9 @@ define i32 @load_i32(ptr addrspace(8) %buf) {
   ret i32 %ret
 }
 
-define void @store_i32(i32 %data, ptr addrspace(8) %buf) {
+define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i32(
-; CHECK-SAME: i32 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -71,9 +75,9 @@ define void @store_i32(i32 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i64 @load_i64(ptr addrspace(8) %buf) {
+define i64 @load_i64(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i64 @load_i64(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i64 [[RET]]
 ;
@@ -82,9 +86,9 @@ define i64 @load_i64(ptr addrspace(8) %buf) {
   ret i64 %ret
 }
 
-define void @store_i64(i64 %data, ptr addrspace(8) %buf) {
+define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i64(
-; CHECK-SAME: i64 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -93,9 +97,9 @@ define void @store_i64(i64 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i128 @load_i128(ptr addrspace(8) %buf) {
+define i128 @load_i128(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i128 @load_i128(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i128 @llvm.amdgcn.raw.ptr.buffer.load.i128(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i128 [[RET]]
 ;
@@ -104,9 +108,9 @@ define i128 @load_i128(ptr addrspace(8) %buf) {
   ret i128 %ret
 }
 
-define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
+define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i128(
-; CHECK-SAME: i128 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i128 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i128(i128 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -115,10 +119,11 @@ define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
+define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <1 x i32> @load_v1i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <1 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
 ; CHECK-NEXT:    ret <1 x i32> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -126,10 +131,11 @@ define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
   ret <1 x i32> %ret
 }
 
-define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v1i32(
-; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v1i32(<1 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -137,9 +143,9 @@ define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i32> @load_v2i32(ptr addrspace(8) %buf) {
+define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i32> @load_v2i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x i32> [[RET]]
 ;
@@ -148,9 +154,9 @@ define <2 x i32> @load_v2i32(ptr addrspace(8) %buf) {
   ret <2 x i32> %ret
 }
 
-define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i32(
-; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -159,9 +165,9 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x i32> @load_v3i32(ptr addrspace(8) %buf) {
+define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x i32> @load_v3i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <3 x i32> [[RET]]
 ;
@@ -170,9 +176,9 @@ define <3 x i32> @load_v3i32(ptr addrspace(8) %buf) {
   ret <3 x i32> %ret
 }
 
-define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3i32(
-; CHECK-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -181,9 +187,9 @@ define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i32> @load_v4i32(ptr addrspace(8) %buf) {
+define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i32> @load_v4i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
@@ -192,9 +198,9 @@ define <4 x i32> @load_v4i32(ptr addrspace(8) %buf) {
   ret <4 x i32> %ret
 }
 
-define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i32(
-; CHECK-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -203,9 +209,9 @@ define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i16> @load_v2i16(ptr addrspace(8) %buf) {
+define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i16> @load_v2i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x i16> [[RET]]
 ;
@@ -214,9 +220,9 @@ define <2 x i16> @load_v2i16(ptr addrspace(8) %buf) {
   ret <2 x i16> %ret
 }
 
-define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i16(
-; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -225,9 +231,9 @@ define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i16> @load_v4i16(ptr addrspace(8) %buf) {
+define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i16> @load_v4i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x i16> [[RET]]
 ;
@@ -236,9 +242,9 @@ define <4 x i16> @load_v4i16(ptr addrspace(8) %buf) {
   ret <4 x i16> %ret
 }
 
-define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i16(
-; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -247,9 +253,9 @@ define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x i16> @load_v8i16(ptr addrspace(8) %buf) {
+define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x i16> @load_v8i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <8 x i16> [[RET]]
 ;
@@ -258,9 +264,9 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) %buf) {
   ret <8 x i16> %ret
 }
 
-define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8i16(
-; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -269,9 +275,9 @@ define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i64> @load_v2i64(ptr addrspace(8) %buf) {
+define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i64> @load_v2i64(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x i64> @llvm.amdgcn.raw.ptr.buffer.load.v2i64(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x i64> [[RET]]
 ;
@@ -280,9 +286,9 @@ define <2 x i64> @load_v2i64(ptr addrspace(8) %buf) {
   ret <2 x i64> %ret
 }
 
-define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) %buf) {
+define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i64(
-; CHECK-SAME: <2 x i64> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i64(<2 x i64> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -291,9 +297,9 @@ define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define half @load_f16(ptr addrspace(8) %buf) {
+define half @load_f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define half @load_f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret half [[RET]]
 ;
@@ -302,9 +308,9 @@ define half @load_f16(ptr addrspace(8) %buf) {
   ret half %ret
 }
 
-define void @store_f16(half %data, ptr addrspace(8) %buf) {
+define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_f16(
-; CHECK-SAME: half [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: half [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -313,9 +319,9 @@ define void @store_f16(half %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define bfloat @load_bf16(ptr addrspace(8) %buf) {
+define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define bfloat @load_bf16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret bfloat [[RET]]
 ;
@@ -324,9 +330,9 @@ define bfloat @load_bf16(ptr addrspace(8) %buf) {
   ret bfloat %ret
 }
 
-define void @store_bf16(bfloat %data, ptr addrspace(8) %buf) {
+define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_bf16(
-; CHECK-SAME: bfloat [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: bfloat [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -335,9 +341,9 @@ define void @store_bf16(bfloat %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x half> @load_v2f16(ptr addrspace(8) %buf) {
+define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x half> @load_v2f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x half> [[RET]]
 ;
@@ -346,9 +352,9 @@ define <2 x half> @load_v2f16(ptr addrspace(8) %buf) {
   ret <2 x half> %ret
 }
 
-define void @store_v2f16(<2 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2f16(
-; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -357,9 +363,9 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x bfloat> @load_v4bf16(ptr addrspace(8) %buf) {
+define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x bfloat> @load_v4bf16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x bfloat> [[RET]]
 ;
@@ -368,9 +374,9 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) %buf) {
   ret <4 x bfloat> %ret
 }
 
-define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) %buf) {
+define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4bf16(
-; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -379,9 +385,9 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x half> @load_v8f16(ptr addrspace(8) %buf) {
+define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x half> @load_v8f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <8 x half> [[RET]]
 ;
@@ -390,9 +396,9 @@ define <8 x half> @load_v8f16(ptr addrspace(8) %buf) {
   ret <8 x half> %ret
 }
 
-define void @store_v8f16(<8 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8f16(
-; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -401,9 +407,9 @@ define void @store_v8f16(<8 x half> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define float @load_f32(ptr addrspace(8) %buf) {
+define float @load_f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define float @load_f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret float [[RET]]
 ;
@@ -412,9 +418,9 @@ define float @load_f32(ptr addrspace(8) %buf) {
   ret float %ret
 }
 
-define void @store_f32(float %data, ptr addrspace(8) %buf) {
+define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_f32(
-; CHECK-SAME: float [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: float [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -423,9 +429,9 @@ define void @store_f32(float %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x float> @load_v2f32(ptr addrspace(8) %buf) {
+define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x float> @load_v2f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x float> [[RET]]
 ;
@@ -434,9 +440,9 @@ define <2 x float> @load_v2f32(ptr addrspace(8) %buf) {
   ret <2 x float> %ret
 }
 
-define void @store_v2f32(<2 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2f32(
-; CHECK-SAME: <2 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -445,9 +451,9 @@ define void @store_v2f32(<2 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x float> @load_v3f32(ptr addrspace(8) %buf) {
+define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x float> @load_v3f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <3 x float> [[RET]]
 ;
@@ -456,9 +462,9 @@ define <3 x float> @load_v3f32(ptr addrspace(8) %buf) {
   ret <3 x float> %ret
 }
 
-define void @store_v3f32(<3 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3f32(
-; CHECK-SAME: <3 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -467,9 +473,9 @@ define void @store_v3f32(<3 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x float> @load_v4f32(ptr addrspace(8) %buf) {
+define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x float> @load_v4f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x float> [[RET]]
 ;
@@ -478,9 +484,9 @@ define <4 x float> @load_v4f32(ptr addrspace(8) %buf) {
   ret <4 x float> %ret
 }
 
-define void @store_v4f32(<4 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4f32(
-; CHECK-SAME: <4 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -489,9 +495,9 @@ define void @store_v4f32(<4 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(0) @load_p0(ptr addrspace(8) %buf) {
+define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr @load_p0(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr [[RET]]
 ;
@@ -500,9 +506,9 @@ define ptr addrspace(0) @load_p0(ptr addrspace(8) %buf) {
   ret ptr addrspace(0) %ret
 }
 
-define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) %buf) {
+define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p0(
-; CHECK-SAME: ptr [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p0(ptr [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -511,9 +517,9 @@ define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(1) @load_p1(ptr addrspace(8) %buf) {
+define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(1) @load_p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(1) [[RET]]
 ;
@@ -522,9 +528,9 @@ define ptr addrspace(1) @load_p1(ptr addrspace(8) %buf) {
   ret ptr addrspace(1) %ret
 }
 
-define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) %buf) {
+define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p1(
-; CHECK-SAME: ptr addrspace(1) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -533,9 +539,9 @@ define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(2) @load_p2(ptr addrspace(8) %buf) {
+define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(2) @load_p2(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(2) @llvm.amdgcn.raw.ptr.buffer.load.p2(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(2) [[RET]]
 ;
@@ -544,9 +550,9 @@ define ptr addrspace(2) @load_p2(ptr addrspace(8) %buf) {
   ret ptr addrspace(2) %ret
 }
 
-define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) %buf) {
+define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p2(
-; CHECK-SAME: ptr addrspace(2) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(2) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p2(ptr addrspace(2) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -555,9 +561,9 @@ define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(3) @load_p3(ptr addrspace(8) %buf) {
+define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(3) @load_p3(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(3) @llvm.amdgcn.raw.ptr.buffer.load.p3(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(3) [[RET]]
 ;
@@ -566,9 +572,9 @@ define ptr addrspace(3) @load_p3(ptr addrspace(8) %buf) {
   ret ptr addrspace(3) %ret
 }
 
-define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) %buf) {
+define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p3(
-; CHECK-SAME: ptr addrspace(3) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(3) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p3(ptr addrspace(3) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -577,9 +583,9 @@ define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(4) @load_p4(ptr addrspace(8) %buf) {
+define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(4) @load_p4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(4) [[RET]]
 ;
@@ -588,9 +594,9 @@ define ptr addrspace(4) @load_p4(ptr addrspace(8) %buf) {
   ret ptr addrspace(4) %ret
 }
 
-define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) %buf) {
+define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p4(
-; CHECK-SAME: ptr addrspace(4) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(4) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p4(ptr addrspace(4) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -599,9 +605,9 @@ define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(5) @load_p5(ptr addrspace(8) %buf) {
+define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(5) @load_p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(5) @llvm.amdgcn.raw.ptr.buffer.load.p5(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(5) [[RET]]
 ;
@@ -610,9 +616,9 @@ define ptr addrspace(5) @load_p5(ptr addrspace(8) %buf) {
   ret ptr addrspace(5) %ret
 }
 
-define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) %buf) {
+define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p5(
-; CHECK-SAME: ptr addrspace(5) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(5) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p5(ptr addrspace(5) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -621,9 +627,9 @@ define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(6) @load_p6(ptr addrspace(8) %buf) {
+define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(6) @load_p6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(6) @llvm.amdgcn.raw.ptr.buffer.load.p6(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(6) [[RET]]
 ;
@@ -632,9 +638,9 @@ define ptr addrspace(6) @load_p6(ptr addrspace(8) %buf) {
   ret ptr addrspace(6) %ret
 }
 
-define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) %buf) {
+define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p6(
-; CHECK-SAME: ptr addrspace(6) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(6) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p6(ptr addrspace(6) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -643,9 +649,9 @@ define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(8) @load_p8(ptr addrspace(8) %buf) {
+define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(8) @load_p8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(8) @llvm.amdgcn.raw.ptr.buffer.load.p8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(8) [[RET]]
 ;
@@ -654,9 +660,9 @@ define ptr addrspace(8) @load_p8(ptr addrspace(8) %buf) {
   ret ptr addrspace(8) %ret
 }
 
-define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) %buf) {
+define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p8(
-; CHECK-SAME: ptr addrspace(8) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p8(ptr addrspace(8) [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -665,9 +671,9 @@ define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) %buf) {
+define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x ptr addrspace(1)> @load_v2p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x ptr addrspace(1)> [[RET]]
 ;
@@ -676,9 +682,9 @@ define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) %buf) {
   ret <2 x ptr addrspace(1)> %ret
 }
 
-define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
+define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2p1(
-; CHECK-SAME: <2 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -687,9 +693,9 @@ define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) %buf) {
+define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x ptr addrspace(5)> @load_v2p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v2p5(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x ptr addrspace(5)> [[RET]]
 ;
@@ -698,9 +704,9 @@ define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) %buf) {
   ret <2 x ptr addrspace(5)> %ret
 }
 
-define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2p5(
-; CHECK-SAME: <2 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p5(<2 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -709,9 +715,9 @@ define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) %buf) {
+define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x ptr addrspace(5)> @load_v3p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <3 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v3p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <3 x ptr addrspace(5)> [[RET]]
 ;
@@ -720,9 +726,9 @@ define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) %buf) {
   ret <3 x ptr addrspace(5)> %ret
 }
 
-define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3p5(
-; CHECK-SAME: <3 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3p5(<3 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -731,9 +737,9 @@ define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) %buf) {
+define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x ptr addrspace(5)> @load_v4p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v4p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x ptr addrspace(5)> [[RET]]
 ;
@@ -742,9 +748,9 @@ define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) %buf) {
   ret <4 x ptr addrspace(5)> %ret
 }
 
-define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4p5(
-; CHECK-SAME: <4 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4p5(<4 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -755,10 +761,11 @@ define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
 
 ;;; 3 words in a short type. These need to be bitcast to <3 x i32> to be supported.
 
-define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
+define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x half> @load_v6f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x half>
 ; CHECK-NEXT:    ret <6 x half> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -766,10 +773,11 @@ define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
   ret <6 x half> %ret
 }
 
-define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6f16(
-; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6f16(<6 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <6 x half> [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -779,10 +787,14 @@ define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
 
 ;;; Long types (32 bit elements). Must be split into multiple operations.
 
-define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
+define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <5 x float> @load_v5f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <5 x float> @llvm.amdgcn.raw.ptr.buffer.load.v5f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x float> poison, <5 x float> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x float> [[RET_PARTS_0]], float [[RET_OFF_16]], i64 4
 ; CHECK-NEXT:    ret <5 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -790,10 +802,13 @@ define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
   ret <5 x float> %ret
 }
 
-define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v5f32(
-; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v5f32(<5 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x float> [[DATA]], <5 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x float> [[DATA]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -801,10 +816,15 @@ define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
+define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x float> @load_v6f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x float> @llvm.amdgcn.raw.ptr.buffer.load.v6f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x float> poison, <6 x float> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x float> [[RET_OFF_16]], <2 x float> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <6 x float> [[RET_PARTS_0]], <6 x float> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:    ret <6 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -812,10 +832,13 @@ define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
   ret <6 x float> %ret
 }
 
-define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6f32(
-; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6f32(<6 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -823,10 +846,15 @@ define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
+define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <7 x float> @load_v7f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <7 x float> @llvm.amdgcn.raw.ptr.buffer.load.v7f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x float> poison, <7 x float> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <3 x float> [[RET_OFF_16]], <3 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <7 x float> [[RET_PARTS_0]], <7 x float> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 9>
 ; CHECK-NEXT:    ret <7 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -834,10 +862,13 @@ define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
   ret <7 x float> %ret
 }
 
-define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v7f32(
-; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v7f32(<7 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -845,10 +876,15 @@ define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
+define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x float> @load_v8f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x float> @llvm.amdgcn.raw.ptr.buffer.load.v8f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x float> poison, <8 x float> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_OFF_16]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <8 x float> [[RET_PARTS_0]], <8 x float> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -856,10 +892,13 @@ define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
   ret <8 x float> %ret
 }
 
-define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8f32(
-; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8f32(<8 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -867,10 +906,18 @@ define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
+define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <10 x float> @load_v10f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <10 x float> @llvm.amdgcn.raw.ptr.buffer.load.v10f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <10 x float> poison, <10 x float> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_OFF_16]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <10 x float> [[RET_PARTS_0]], <10 x float> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; CHECK-NEXT:    [[RET_OFF_32:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_8:%.*]] = shufflevector <2 x float> [[RET_OFF_32]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <10 x float> [[RET_PARTS_4]], <10 x float> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
 ; CHECK-NEXT:    ret <10 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -878,10 +925,15 @@ define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
   ret <10 x float> %ret
 }
 
-define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v10f32(
-; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v10f32(<10 x float> [[DATA]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_8:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <2 x i32> <i32 8, i32 9>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_8]], ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -889,10 +941,15 @@ define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
+define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i32> @load_v6i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v6i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:    ret <6 x i32> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -900,10 +957,13 @@ define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
   ret <6 x i32> %ret
 }
 
-define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6i32(
-; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i32(<6 x i32> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -911,10 +971,15 @@ define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
+define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x ptr addrspace(1)> @load_v4p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v4p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_OFF_0]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <4 x ptr addrspace(1)> poison, <4 x ptr addrspace(1)> [[RET_EXT_0]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_2:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_OFF_16]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <4 x ptr addrspace(1)> [[RET_PARTS_0]], <4 x ptr addrspace(1)> [[RET_EXT_2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x ptr addrspace(1)> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -922,10 +987,13 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
   ret <4 x ptr addrspace(1)> %ret
 }
 
-define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
+define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4p1(
-; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4p1(<4 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_2]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -935,10 +1003,11 @@ define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
 
 ;;; Uneven types with 16-bit elements. Require splitting into multiple operations.
 
-define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
+define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <1 x i16> @load_v1i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <1 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v1i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <1 x i16>
 ; CHECK-NEXT:    ret <1 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -946,10 +1015,11 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
   ret <1 x i16> %ret
 }
 
-define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v1i16(
-; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v1i16(<1 x i16> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i16> [[DATA]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_STORABLE]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -957,10 +1027,14 @@ define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
+define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x i16> @load_v3i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <3 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v3i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
 ; CHECK-NEXT:    ret <3 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -968,10 +1042,13 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
   ret <3 x i16> %ret
 }
 
-define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3i16(
-; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i16(<3 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -979,10 +1056,14 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
+define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <5 x i16> @load_v5i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <5 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v5i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i16> [[RET_OFF_0]], <4 x i16> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i16> poison, <5 x i16> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_8]], i64 4
 ; CHECK-NEXT:    ret <5 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -990,10 +1071,13 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
   ret <5 x i16> %ret
 }
 
-define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v5i16(
-; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v5i16(<5 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i16> [[DATA]], <5 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i16> [[DATA]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1001,10 +1085,11 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
+define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i16> @load_v6i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x i16>
 ; CHECK-NEXT:    ret <6 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1012,10 +1097,11 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
   ret <6 x i16> %ret
 }
 
-define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6i16(
-; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <6 x i16> [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1023,10 +1109,15 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
+define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <7 x i16> @load_v7i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <7 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v7i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast <3 x i32> [[RET_OFF_0]] to <6 x i16>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <6 x i16> [[RET_OFF_0_FROM_LOADABLE]], <6 x i16> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x i16> poison, <7 x i16> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6>
+; CHECK-NEXT:    [[RET_OFF_12:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_12]], i64 6
 ; CHECK-NEXT:    ret <7 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1034,10 +1125,14 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
   ret <7 x i16> %ret
 }
 
-define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v7i16(
-; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v7i16(<7 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x i16> [[DATA]], <7 x i16> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <6 x i16> [[DATA_SLICE_0]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_6:%.*]] = extractelement <7 x i16> [[DATA]], i64 6
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1045,10 +1140,14 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
+define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <9 x i16> @load_v9i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <9 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v9i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <8 x i16> [[RET_OFF_0]], <8 x i16> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <9 x i16> poison, <9 x i16> [[RET_EXT_0]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <9 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_16]], i64 8
 ; CHECK-NEXT:    ret <9 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1056,10 +1155,13 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
   ret <9 x i16> %ret
 }
 
-define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v9i16(
-; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v9i16(<9 x i16> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <9 x i16> [[DATA]], <9 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_8:%.*]] = extractelement <9 x i16> [[DATA]], i64 8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1071,10 +1173,11 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
 ;;; - Split into multiple operations
 ;;; - Bitcast if they have a natively supported width
 
-define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
+define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <1 x i8> @load_v1i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <1 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v1i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <1 x i8>
 ; CHECK-NEXT:    ret <1 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1082,10 +1185,11 @@ define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
   ret <1 x i8> %ret
 }
 
-define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v1i8(
-; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v1i8(<1 x i8> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <1 x i8> [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1093,10 +1197,11 @@ define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
+define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i8> @load_v2i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <2 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v2i8(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
 ; CHECK-NEXT:    ret <2 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1104,10 +1209,11 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
   ret <2 x i8> %ret
 }
 
-define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i8(
-; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i8(<2 x i8> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1115,10 +1221,15 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
+define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x i8> @load_v3i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <3 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v3i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_0]] to <2 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i8> [[RET_OFF_0_FROM_LOADABLE]], <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i8> poison, <3 x i8> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_2]], i64 2
 ; CHECK-NEXT:    ret <3 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1126,10 +1237,14 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
   ret <3 x i8> %ret
 }
 
-define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3i8(
-; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i8(<3 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i8> [[DATA]], <3 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_0]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i8> [[DATA]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1137,10 +1252,11 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
+define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i8> @load_v4i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1148,10 +1264,11 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
   ret <4 x i8> %ret
 }
 
-define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i8(
-; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1159,10 +1276,15 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
+define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <5 x i8> @load_v5i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <5 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v5i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_4]], i64 4
 ; CHECK-NEXT:    ret <5 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1170,10 +1292,14 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
   ret <5 x i8> %ret
 }
 
-define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v5i8(
-; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v5i8(<5 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1181,10 +1307,15 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
+define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i8> @load_v6i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
 ; CHECK-NEXT:    ret <6 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1192,10 +1323,14 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
   ret <6 x i8> %ret
 }
 
-define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6i8(
-; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1203,10 +1338,19 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
+define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <7 x i8> @load_v7i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <7 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v7i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x i8> poison, <7 x i8> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_4_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_4]] to <2 x i8>
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i8> [[RET_OFF_4_FROM_LOADABLE]], <2 x i8> poison, <7 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <7 x i8> [[RET_PARTS_0]], <7 x i8> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 6>
+; CHECK-NEXT:    [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i8> [[RET_PARTS_4]], i8 [[RET_OFF_6]], i64 6
 ; CHECK-NEXT:    ret <7 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1214,10 +1358,17 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
   ret <7 x i8> %ret
 }
 
-define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v7i8(
-; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v7i8(<7 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[DATA_SLICE_4_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_4]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_6:%.*]] = extractelement <7 x i8> [[DATA]], i64 6
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1225,10 +1376,11 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
+define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x i8> @load_v8i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v8i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <8 x i8>
 ; CHECK-NEXT:    ret <8 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1236,10 +1388,11 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
   ret <8 x i8> %ret
 }
 
-define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8i8(
-; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i8(<8 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <8 x i8> [[DATA]] to <2 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1247,10 +1400,11 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
+define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <12 x i8> @load_v12i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <12 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v12i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <12 x i8>
 ; CHECK-NEXT:    ret <12 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1258,10 +1412,11 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
   ret <12 x i8> %ret
 }
 
-define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v12i8(
-; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v12i8(<12 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <12 x i8> [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1269,10 +1424,11 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
+define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <16 x i8> @load_v16i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <16 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v16i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <4 x i32> [[RET_LOADABLE]] to <16 x i8>
 ; CHECK-NEXT:    ret <16 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1280,10 +1436,11 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
   ret <16 x i8> %ret
 }
 
-define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v16i8(
-; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v16i8(<16 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <16 x i8> [[DATA]] to <4 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1291,10 +1448,16 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
+define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <32 x i8> @load_v32i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <32 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v32i8(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to <32 x i8>
 ; CHECK-NEXT:    ret <32 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1302,10 +1465,14 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
   ret <32 x i8> %ret
 }
 
-define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v32i8(
-; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v32i8(<32 x i8> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i8> [[DATA]] to <8 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1315,10 +1482,13 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
 
 ;;; Arrays. Need to become vectors.
 
-define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
+define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [1 x i32] @load_a1i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [1 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <1 x i32> [[RET_FROM_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [1 x i32] poison, i32 [[RET_ELEM_0]], 0
 ; CHECK-NEXT:    ret [1 x i32] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1326,10 +1496,13 @@ define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
   ret [1 x i32] %ret
 }
 
-define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
+define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a1i32(
-; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a1i32([1 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [1 x i32] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <1 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA_AS_VEC_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1337,10 +1510,14 @@ define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
+define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [2 x i32] @load_a2i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [2 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET_ELEM_0]], 0
+; CHECK-NEXT:    [[RET_ELEM_1:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x i32] [[RET_AS_ARRAY_0]], i32 [[RET_ELEM_1]], 1
 ; CHECK-NEXT:    ret [2 x i32] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1348,10 +1525,14 @@ define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
   ret [2 x i32] %ret
 }
 
-define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
+define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a2i32(
-; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a2i32([2 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA]], 1
+; CHECK-NEXT:    [[DATA_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA_AS_VEC_0]], i32 [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1359,10 +1540,14 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
+define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [2 x half] @load_a2f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [2 x half] @llvm.amdgcn.raw.ptr.buffer.load.a2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0
+; CHECK-NEXT:    [[RET_ELEM_1:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x half] [[RET_AS_ARRAY_0]], half [[RET_ELEM_1]], 1
 ; CHECK-NEXT:    ret [2 x half] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1370,10 +1555,14 @@ define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
   ret [2 x half] %ret
 }
 
-define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
+define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a2f16(
-; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a2f16([2 x half] [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x half] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1
+; CHECK-NEXT:    [[DATA_AS_VEC_1:%.*]] = insertelement <2 x half> [[DATA_AS_VEC_0]], half [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA_AS_VEC_1]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1381,10 +1570,14 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
+define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [2 x ptr addrspace(1)] @load_a2p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [2 x ptr addrspace(1)] @llvm.amdgcn.raw.ptr.buffer.load.a2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x ptr addrspace(1)] poison, ptr addrspace(1) [[RET_ELEM_0]], 0
+; CHECK-NEXT:    [[RET_ELEM_1:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x ptr addrspace(1)] [[RET_AS_ARRAY_0]], ptr addrspace(1) [[RET_ELEM_1]], 1
 ; CHECK-NEXT:    ret [2 x ptr addrspace(1)] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1392,10 +1585,14 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
   ret [2 x ptr addrspace(1)] %ret
 }
 
-define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
+define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a2p1(
-; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a2p1([2 x ptr addrspace(1)] [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 1
+; CHECK-NEXT:    [[DATA_AS_VEC_1:%.*]] = insertelement <2 x ptr addrspace(1)> [[DATA_AS_VEC_0]], ptr addrspace(1) [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_AS_VEC_1]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1405,10 +1602,16 @@ define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
 
 ;;; Scalars of atypical width. Need to be cast to vectors and split.
 
-define i40 @load_i40(ptr addrspace(8) %buf) {
+define i40 @load_i40(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i40 @load_i40(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i40 @llvm.amdgcn.raw.ptr.buffer.load.i40(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_4]], i64 4
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <5 x i8> [[RET_SLICE_4]] to i40
 ; CHECK-NEXT:    ret i40 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1416,10 +1619,15 @@ define i40 @load_i40(ptr addrspace(8) %buf) {
   ret i40 %ret
 }
 
-define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
+define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i40(
-; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i40(i40 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i40 [[DATA]] to <5 x i8>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA_LEGAL]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1427,10 +1635,11 @@ define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i96 @load_i96(ptr addrspace(8) %buf) {
+define i96 @load_i96(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i96 @load_i96(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i96 @llvm.amdgcn.raw.ptr.buffer.load.i96(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to i96
 ; CHECK-NEXT:    ret i96 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1438,10 +1647,11 @@ define i96 @load_i96(ptr addrspace(8) %buf) {
   ret i96 %ret
 }
 
-define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
+define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i96(
-; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i96(i96 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i96 [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1449,10 +1659,15 @@ define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i160 @load_i160(ptr addrspace(8) %buf) {
+define i160 @load_i160(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i160 @load_i160(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i160 @llvm.amdgcn.raw.ptr.buffer.load.i160(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i32> poison, <5 x i32> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i32> [[RET_PARTS_0]], i32 [[RET_OFF_16]], i64 4
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <5 x i32> [[RET_SLICE_4]] to i160
 ; CHECK-NEXT:    ret i160 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1460,10 +1675,14 @@ define i160 @load_i160(ptr addrspace(8) %buf) {
   ret i160 %ret
 }
 
-define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
+define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i160(
-; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i160 [[DATA]] to <5 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i32> [[DATA_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i32> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1471,10 +1690,16 @@ define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i256 @load_i256(ptr addrspace(8) %buf) {
+define i256 @load_i256(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i256 @load_i256(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i256 @llvm.amdgcn.raw.ptr.buffer.load.i256(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to i256
 ; CHECK-NEXT:    ret i256 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1482,10 +1707,14 @@ define i256 @load_i256(ptr addrspace(8) %buf) {
   ret i256 %ret
 }
 
-define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
+define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i256(
-; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i256(i256 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i256 [[DATA]] to <8 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1495,10 +1724,11 @@ define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
 
 ;;; Non-byte-sized scalars. Require zero-extension.
 
-define i7 @load_i4(ptr addrspace(8) %buf) {
-; CHECK-LABEL: define i7 @load_i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i7 @llvm.amdgcn.raw.ptr.buffer.load.i7(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+define i7 @load_i7(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define i7 @load_i7(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i7
 ; CHECK-NEXT:    ret i7 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1506,10 +1736,11 @@ define i7 @load_i4(ptr addrspace(8) %buf) {
   ret i7 %ret
 }
 
-define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
-; CHECK-LABEL: define void @store_i4(
-; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i7(i7 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_i7(
+; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i7 [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1517,12 +1748,37 @@ define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
+define i4 @load_i4(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define i4 @load_i4(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i4
+; CHECK-NEXT:    ret i4 [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i4, ptr addrspace(7) %p
+  ret i4 %ret
+}
+
+define void @store_i4(i4 %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_i4(
+; CHECK-SAME: i4 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i4 [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i4 %data, ptr addrspace(7) %p
+  ret void
+}
+
 ;;; Byte-sized vectors of i4. Require casts.
 
-define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
+define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i4> @load_v2i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <2 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v2i4(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <2 x i4>
 ; CHECK-NEXT:    ret <2 x i4> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1530,10 +1786,11 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
   ret <2 x i4> %ret
 }
 
-define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i4(
-; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i4(<2 x i4> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i4> [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1541,10 +1798,11 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
+define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i4> @load_v4i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v4i4(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <4 x i4>
 ; CHECK-NEXT:    ret <4 x i4> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1552,10 +1810,11 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
   ret <4 x i4> %ret
 }
 
-define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i4(
-; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i4(<4 x i4> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i4> [[DATA]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1563,10 +1822,11 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
+define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x i4> @load_v8i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v8i4(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <8 x i4>
 ; CHECK-NEXT:    ret <8 x i4> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1574,10 +1834,11 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
   ret <8 x i4> %ret
 }
 
-define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8i4(
-; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i4(<8 x i4> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <8 x i4> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1587,10 +1848,12 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
 
 ;;; Vectors of non-byte-sized integers.
 
-define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
+define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i6> @load_v2i6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <2 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v2i6(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_TRUNC:%.*]] = trunc i16 [[RET_LOADABLE]] to i12
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i12 [[RET_TRUNC]] to <2 x i6>
 ; CHECK-NEXT:    ret <2 x i6> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1598,10 +1861,12 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
   ret <2 x i6> %ret
 }
 
-define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
+define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i6(
-; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i6(<2 x i6> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_AS_SCALAR:%.*]] = bitcast <2 x i6> [[DATA]] to i12
+; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i12 [[DATA_AS_SCALAR]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_ZEXT]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1610,10 +1875,16 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
 }
 
 ;; Blocks of fp6 elements
-define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
+define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i32> @load_v32i6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <32 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v32i6(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <6 x i32> [[RET_PARTS_4]] to <32 x i6>
 ; CHECK-NEXT:    [[RET_CAST:%.*]] = bitcast <32 x i6> [[RET]] to <6 x i32>
 ; CHECK-NEXT:    ret <6 x i32> [[RET_CAST]]
 ;
@@ -1623,11 +1894,15 @@ define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
   ret <6 x i32> %ret.cast
 }
 
-define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
+define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v32i6(
-; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v32i6(<32 x i6> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %data = bitcast <6 x i32> %data.abi to <32 x i6>
@@ -1638,10 +1913,11 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
 
 ;;; Modifiers
 
-define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
+define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i8> @volatile_load_v4i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1649,10 +1925,11 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
   ret <4 x i8> %ret
 }
 
-define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
+define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @volatile_store_v4i8(
-; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1660,10 +1937,15 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
+define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i8> @volatile_load_v6i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
 ; CHECK-NEXT:    ret <6 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1671,13 +1953,257 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
   ret <6 x i8> %ret
 }
 
-define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
+define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @volatile_store_v6i8(
-; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   store volatile <6 x i8> %data, ptr addrspace(7) %p
   ret void
 }
+
+define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [2 x [2 x i32]] @load_a2a2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET0_ELEM_0:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 0
+; CHECK-NEXT:    [[RET0_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET0_ELEM_0]], 0
+; CHECK-NEXT:    [[RET0_ELEM_1:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 1
+; CHECK-NEXT:    [[RET0_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET0_AS_ARRAY_0]], i32 [[RET0_ELEM_1]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [2 x [2 x i32]] poison, [2 x i32] [[RET0_AS_ARRAY_1]], 0
+; CHECK-NEXT:    [[RET1_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET1_ELEM_0:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 0
+; CHECK-NEXT:    [[RET1_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET1_ELEM_0]], 0
+; CHECK-NEXT:    [[RET1_ELEM_1:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 1
+; CHECK-NEXT:    [[RET1_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET1_AS_ARRAY_0]], i32 [[RET1_ELEM_1]], 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x [2 x i32]] [[RET0]], [2 x i32] [[RET1_AS_ARRAY_1]], 1
+; CHECK-NEXT:    ret [2 x [2 x i32]] [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x [2 x i32]], ptr addrspace(7) %p
+  ret [2 x [2 x i32]] %ret
+}
+
+define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a2a2i32(
+; CHECK-SAME: [2 x [2 x i32]] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA0:%.*]] = extractvalue [2 x [2 x i32]] [[DATA]], 0
+; CHECK-NEXT:    [[DATA0_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA0]], 0
+; CHECK-NEXT:    [[DATA0_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA0_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA0_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA0]], 1
+; CHECK-NEXT:    [[DATA0_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA0_AS_VEC_0]], i32 [[DATA0_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA0_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA1:%.*]] = extractvalue [2 x [2 x i32]] [[DATA]], 1
+; CHECK-NEXT:    [[DATA1_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA1]], 0
+; CHECK-NEXT:    [[DATA1_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA1_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA1_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA1]], 1
+; CHECK-NEXT:    [[DATA1_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA1_AS_VEC_0]], i32 [[DATA1_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA1_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x [2 x i32]] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x <2 x i32>] @load_a2v2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [2 x <2 x i32>] @load_a2v2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[RET0_OFF_0]], 0
+; CHECK-NEXT:    [[RET1_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x <2 x i32>] [[RET0]], <2 x i32> [[RET1_OFF_8]], 1
+; CHECK-NEXT:    ret [2 x <2 x i32>] [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x <2 x i32>], ptr addrspace(7) %p
+  ret [2 x <2 x i32>] %ret
+}
+
+define void @store_a2v2i32([2 x <2 x i32>] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a2v2i32(
+; CHECK-SAME: [2 x <2 x i32>] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA0:%.*]] = extractvalue [2 x <2 x i32>] [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA1:%.*]] = extractvalue [2 x <2 x i32>] [[DATA]], 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA1]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x <2 x i32>] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i32 } @load_sl_i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { i32 } @load_sl_i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { i32 } poison, i32 [[RET_0_OFF_0]], 0
+; CHECK-NEXT:    ret { i32 } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i32 }, ptr addrspace(7) %p
+  ret { i32 } %ret
+}
+
+define void @store_sl_i32s({ i32 } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_i32s(
+; CHECK-SAME: { i32 } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0:%.*]] = extractvalue { i32 } [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { { float } } @load_sl_sl_f32ss(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { { float } } @load_sl_sl_f32ss(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_0_OFF_0:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { { float } } poison, float [[RET_0_0_OFF_0]], 0, 0
+; CHECK-NEXT:    ret { { float } } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { { float } }, ptr addrspace(7) %p
+  ret { { float } } %ret
+}
+
+define void @store_sl_sl_f32ss({ { float } } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_sl_f32ss(
+; CHECK-SAME: { { float } } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0_0:%.*]] = extractvalue { { float } } [[DATA]], 0, 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA_0_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { { float } } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { <2 x i32> } @load_sl_v2i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { <2 x i32> } @load_sl_v2i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { <2 x i32> } poison, <2 x i32> [[RET_0_OFF_0]], 0
+; CHECK-NEXT:    ret { <2 x i32> } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { <2 x i32> }, ptr addrspace(7) %p
+  ret { <2 x i32> } %ret
+}
+
+define void @store_sl_v2i32s({ <2 x i32> } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_v2i32s(
+; CHECK-SAME: { <2 x i32> } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0:%.*]] = extractvalue { <2 x i32> } [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { <2 x i32> } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i64, i32 } @load_sl_i64i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { i64, i32 } @load_sl_i64i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_OFF_0:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_0:%.*]] = insertvalue { i64, i32 } poison, i64 [[RET_0_OFF_0]], 0
+; CHECK-NEXT:    [[RET_1_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { i64, i32 } [[RET_0]], i32 [[RET_1_OFF_8]], 1
+; CHECK-NEXT:    ret { i64, i32 } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i64, i32 }, ptr addrspace(7) %p
+  ret { i64, i32 } %ret
+}
+
+define void @store_sl_i64i32s({ i64, i32 } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_i64i32s(
+; CHECK-SAME: { i64, i32 } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0:%.*]] = extractvalue { i64, i32 } [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_1:%.*]] = extractvalue { i64, i32 } [[DATA]], 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_1]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i64, i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [4 x i7] @load_a4i7(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [4 x i7] @load_a4i7(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET0_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET0_TRUNC:%.*]] = trunc i8 [[RET0_OFF_0]] to i7
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [4 x i7] poison, i7 [[RET0_TRUNC]], 0
+; CHECK-NEXT:    [[RET1_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT:    [[RET1_TRUNC:%.*]] = trunc i8 [[RET1_OFF_1]] to i7
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue [4 x i7] [[RET0]], i7 [[RET1_TRUNC]], 1
+; CHECK-NEXT:    [[RET2_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT:    [[RET2_TRUNC:%.*]] = trunc i8 [[RET2_OFF_2]] to i7
+; CHECK-NEXT:    [[RET2:%.*]] = insertvalue [4 x i7] [[RET1]], i7 [[RET2_TRUNC]], 2
+; CHECK-NEXT:    [[RET3_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT:    [[RET3_TRUNC:%.*]] = trunc i8 [[RET3_OFF_3]] to i7
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [4 x i7] [[RET2]], i7 [[RET3_TRUNC]], 3
+; CHECK-NEXT:    ret [4 x i7] [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [4 x i7], ptr addrspace(7) %p
+  ret [4 x i7] %ret
+}
+
+define void @store_a4i7([4 x i7] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a4i7(
+; CHECK-SAME: [4 x i7] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA0:%.*]] = extractvalue [4 x i7] [[DATA]], 0
+; CHECK-NEXT:    [[DATA0_ZEXT:%.*]] = zext i7 [[DATA0]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA0_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA1:%.*]] = extractvalue [4 x i7] [[DATA]], 1
+; CHECK-NEXT:    [[DATA1_ZEXT:%.*]] = zext i7 [[DATA1]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA1_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA2:%.*]] = extractvalue [4 x i7] [[DATA]], 2
+; CHECK-NEXT:    [[DATA2_ZEXT:%.*]] = zext i7 [[DATA2]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA2_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA3:%.*]] = extractvalue [4 x i7] [[DATA]], 3
+; CHECK-NEXT:    [[DATA3_ZEXT:%.*]] = zext i7 [[DATA3]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA3_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [4 x i7] %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Scalable vector. This isn't semantically meaningful but shouldn't crash.
+
+define <vscale x 2 x i32> @load_nxv2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <vscale x 2 x i32> @load_nxv2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET:%.*]] = call <vscale x 2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.nxv2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <vscale x 2 x i32>, ptr addrspace(7) %p
+  ret <vscale x 2 x i32> %ret
+}
+
+define void @store_nxv2i32(<vscale x 2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_nxv2i32(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <vscale x 2 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index 90fc3cf3d72ea3..c3762e2cfff328 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -54,7 +54,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw i160 [[BUF_PTR_4_PTR_INT_RSRC]], 32, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_OFF:%.*]] = zext i32 [[BUF_PTR_4_PTR_OFF]] to i160, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT:%.*]] = or i160 [[TMP10]], [[BUF_PTR_4_PTR_INT_OFF]], !dbg [[DBG33]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[BUF_PTR_4_PTR_INT]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT:    [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], i64 4, !dbg [[DBG33]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
 ; CHECK-NEXT:    ret float [[RET]], !dbg [[DBG34:![0-9]+]]
 ;
   %buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20



More information about the llvm-commits mailing list