[llvm] [AMDGPU] Handle natively unsupported types in addrspace(7) lowering (PR #110572)

Krzysztof Drewniak via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 18 10:08:31 PST 2024


https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/110572

>From e4ccad26b7d4f3f2f97b7f5221c02bf846f2958e Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 27 Sep 2024 19:18:46 +0000
Subject: [PATCH 01/11] [AMDGPU] Handle natively unsupported types in
 addrspace(7) lowering

The current lowering for ptr addrspace(7) assumed that the instruction
selector can handle arbitrary LLVM types, which is not the case. Code
generation can't deal with:
- Values that aren't 8, 16, 32, 64, 96, or 128 bits long
- Aggregates (this commit only handles arrays of scalars, more may come)
- Vectors of more than one byte (for example, a <4 x i8>)
- 3-word values that aren't a vector of 3 32-bit values (for example, a
  <6 x half>)

This commit adds a buffer contents type legalizer that inserts the
bitcasts, zero-extensions, and splits into subcomponents needed to convert a
load or store operation into one that can be successfully lowered through
code generation.

In the long run, some of the involved bitcasts (though potentially not
the buffer operation splitting) ought to be handled by the instruction
legalizer, but SelectionDAG makes this difficult.

It also takes advantage of the new `nuw` flag on `getelementptr` when
lowering GEPs to offset additions.

We don't currently plumb through `nsw` on GEPs since that should likely
be a separate change and would require declaring what we mean by
"the address" in the context of the GEP guarantees.
---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |  463 +-
 ...ffer-fat-pointers-contents-legalization.ll | 4871 +++++++++++++++++
 .../AMDGPU/lower-buffer-fat-pointers-calls.ll |    9 +-
 ...ffer-fat-pointers-contents-legalization.ll |  430 +-
 ...fer-fat-pointers-unoptimized-debug-data.ll |    7 +-
 5 files changed, 5685 insertions(+), 95 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 787747e6055805..831474c192526f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -86,6 +86,25 @@
 // This phase also records intrinsics so that they can be remangled or deleted
 // later.
 //
+// ## Buffer contents type legalization
+//
+// The underlying buffer intrinsics only support types up to 128 bits long,
+// and don't support complex types. If buffer operations were
+// standard pointer operations that could be represented as MIR-level loads,
+// this would be handled by the various legalization schemes in instruction
+// selection. However, because we have to do the conversion from `load` and
+// `store` to intrinsics at LLVM IR level, we must perform that legalization
+// ourselves.
+//
+// This involves a combination of
+// - Converting arrays to vectors where possible
+// - Zero-extending things to fill a whole number of bytes
+// - Casting values of types that don't neatly correspond to supported machine
+//   values
+//   (for example, an i96 or i256) into ones that would work
+//   (like <3 x i32> and <8 x i32>, respectively)
+// - Splitting values that are too long (such as the aforementioned
+//   <8 x i32>) into multiple operations.
 //
 // ## Splitting pointer structs
 //
@@ -218,6 +237,7 @@
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -551,7 +571,6 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
   auto *NLI = cast<LoadInst>(LI.clone());
   NLI->mutateType(IntTy);
   NLI = IRB.Insert(NLI);
-  copyMetadataForLoad(*NLI, LI);
   NLI->takeName(&LI);
 
   Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
@@ -576,6 +595,434 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
   return true;
 }
 
+namespace {
+/// Convert loads/stores of types that the buffer intrinsics can't handle into
+/// one or more such loads/stores that consist of legal types.
+///
+/// Do this by
+/// 1. Converting arrays of non-aggregate, byte-sized types into their
+/// corresponding vectors
+/// 2. Bitcasting unsupported types, namely overly-long scalars and byte
+/// vectors, into vectors of supported types.
+/// 3. Splitting up excessively long reads/writes into multiple operations.
+///
+/// Note that this doesn't handle complex data structures, but, in the future,
+/// the aggregate load splitter from SROA could be refactored to allow for that
+/// case.
+class LegalizeBufferContentTypesVisitor
+    : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
+  friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
+
+  IRBuilder<> IRB;
+
+  const DataLayout &DL;
+
+  /// If T is [N x U], where U is a scalar type, return the vector type
+  /// <N x U>, otherwise, return T.
+  Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
+  Value *arrayToVector(Value *V, Type *TargetType, StringRef Name);
+  Value *vectorToArray(Value *V, Type *OrigType, StringRef Name);
+
+  /// Break up the loads of a struct into the loads of its components
+
+  /// Convert a vector or scalar type that can't be operated on by buffer
+  /// intrinsics to one that would be legal through bitcasts and/or truncation.
+  /// Uses the widest of i32, i16, or i8 where possible.
+  Type *legalNonAggregateFor(Type *T);
+  Value *makeLegalNonAggregate(Value *V, Type *TargetType, StringRef Name);
+  Value *makeIllegalNonAggregate(Value *V, Type *OrigType, StringRef Name);
+
+  struct Slice {
+    unsigned Offset;
+    unsigned Length;
+    Slice(unsigned Offset, unsigned Length) : Offset(Offset), Length(Length) {}
+  };
+  // Return the [offset, length] pairs into which `T` needs to be cut to form
+  // legal buffer load or store operations. Clears `Slices`. Creates an empty
+  // `Slices` for non-vector inputs and creates one slice if no slicing will be
+  // needed.
+  void getSlices(Type *T, SmallVectorImpl<Slice> &Slices);
+
+  Value *extractSlice(Value *Vec, Slice S, StringRef Name);
+  Value *insertSlice(Value *Whole, Value *Part, Slice S, StringRef Name);
+
+  // In most cases, return `LegalType`. However, when given an input that would
+  // normally be a legal type for the buffer intrinsics to return but that isn't
+  // hooked up through SelectionDAG, return a type of the same width that can be
+  // used with the relevant intrinsics. Specifically, handle the cases:
+  // - <1 x T> => T for all T
+  // - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
+  // - <N x T> where T is under 32 bits and the total size is 96 bits
+  //   <=> <3 x i32>
+  Type *intrinsicTypeFor(Type *LegalType);
+
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitLoadInst(LoadInst &LI);
+  bool visitStoreInst(StoreInst &SI);
+
+public:
+  LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
+      : IRB(Ctx), DL(DL) {}
+  bool processFunction(Function &F);
+};
+} // namespace
+
+Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
+  ArrayType *AT = dyn_cast<ArrayType>(T);
+  if (!AT)
+    return T;
+  Type *ET = AT->getElementType();
+  if (!ET->isSingleValueType() || isa<VectorType>(ET))
+    report_fatal_error(
+        "loading non-scalar arrays from buffer fat pointers is unimplemented");
+  if (!DL.typeSizeEqualsStoreSize(AT))
+    report_fatal_error(
+        "loading padded arrays from buffer fat pinters is unimplemented");
+  return FixedVectorType::get(ET, AT->getNumElements());
+}
+
+Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
+                                                        Type *TargetType,
+                                                        StringRef Name) {
+  Value *VectorRes = PoisonValue::get(TargetType);
+  auto *VT = cast<FixedVectorType>(TargetType);
+  unsigned EC = VT->getNumElements();
+  for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+    Value *Elem = IRB.CreateExtractValue(V, I, Name + ".elem." + Twine(I));
+    VectorRes = IRB.CreateInsertElement(VectorRes, Elem, I,
+                                        Name + ".as.vec." + Twine(I));
+  }
+  return VectorRes;
+}
+
+Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
+                                                        Type *OrigType,
+                                                        StringRef Name) {
+  Value *ArrayRes = PoisonValue::get(OrigType);
+  ArrayType *AT = cast<ArrayType>(OrigType);
+  unsigned EC = AT->getNumElements();
+  for (auto I : iota_range<unsigned>(0, EC, /*Inclusive=*/false)) {
+    Value *Elem = IRB.CreateExtractElement(V, I, Name + ".elem." + Twine(I));
+    ArrayRes = IRB.CreateInsertValue(ArrayRes, Elem, I,
+                                     Name + ".as.array." + Twine(I));
+  }
+  return ArrayRes;
+}
+
+Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
+  TypeSize Size = DL.getTypeStoreSizeInBits(T);
+  // Implicitly zero-extend to the next byte if needed
+  if (!DL.typeSizeEqualsStoreSize(T))
+    T = IRB.getIntNTy(Size.getFixedValue());
+  auto *VT = dyn_cast<VectorType>(T);
+  Type *ElemTy = T;
+  if (VT) {
+    ElemTy = VT->getElementType();
+  }
+  if (isa<PointerType>(ElemTy))
+    return T; // Pointers are always big enough
+  unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
+  if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
+    // [vectors of] anything that's 16/32/64/128 bits can be cast and split into
+    // legal buffer operations.
+    return T;
+  }
+  Type *BestVectorElemType = nullptr;
+  if (Size.isKnownMultipleOf(32))
+    BestVectorElemType = IRB.getInt32Ty();
+  else if (Size.isKnownMultipleOf(16))
+    BestVectorElemType = IRB.getInt16Ty();
+  else
+    BestVectorElemType = IRB.getInt8Ty();
+  unsigned NumCastElems =
+      Size.getFixedValue() / BestVectorElemType->getIntegerBitWidth();
+  if (NumCastElems == 1)
+    return BestVectorElemType;
+  return FixedVectorType::get(BestVectorElemType, NumCastElems);
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
+    Value *V, Type *TargetType, StringRef Name) {
+  Type *SourceType = V->getType();
+  if (DL.getTypeSizeInBits(SourceType) != DL.getTypeSizeInBits(TargetType)) {
+    Type *ShortScalarTy =
+        IRB.getIntNTy(DL.getTypeSizeInBits(SourceType).getFixedValue());
+    Type *ByteScalarTy =
+        IRB.getIntNTy(DL.getTypeSizeInBits(TargetType).getFixedValue());
+    Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar");
+    Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext");
+    V = Zext;
+    SourceType = ByteScalarTy;
+  }
+  if (SourceType == TargetType)
+    return V;
+  return IRB.CreateBitCast(V, TargetType, Name + ".legal");
+}
+
+Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
+    Value *V, Type *OrigType, StringRef Name) {
+  Type *LegalType = V->getType();
+  if (DL.getTypeSizeInBits(LegalType) != DL.getTypeSizeInBits(OrigType)) {
+    Type *ShortScalarTy =
+        IRB.getIntNTy(DL.getTypeSizeInBits(OrigType).getFixedValue());
+    Type *ByteScalarTy =
+        IRB.getIntNTy(DL.getTypeSizeInBits(LegalType).getFixedValue());
+    Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast");
+    Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc");
+    if (OrigType != ShortScalarTy)
+      return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
+    return Trunc;
+  }
+  if (LegalType == OrigType)
+    return V;
+  return IRB.CreateBitCast(V, OrigType, Name + ".real.ty");
+}
+
+Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
+  auto *VT = dyn_cast<FixedVectorType>(LegalType);
+  if (!VT)
+    return LegalType;
+  Type *ET = VT->getElementType();
+  if (VT->getNumElements() == 1)
+    return ET;
+  if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32)
+    return FixedVectorType::get(IRB.getInt32Ty(), 3);
+  if (ET->isIntegerTy(8)) {
+    switch (VT->getNumElements()) {
+    default:
+      return LegalType; // Let it crash later
+    case 1:
+      return IRB.getInt8Ty();
+    case 2:
+      return IRB.getInt16Ty();
+    case 4:
+      return IRB.getInt32Ty();
+    case 8:
+      return FixedVectorType::get(IRB.getInt32Ty(), 2);
+    case 16:
+      return FixedVectorType::get(IRB.getInt32Ty(), 4);
+    }
+  }
+  return LegalType;
+}
+
+void LegalizeBufferContentTypesVisitor::getSlices(
+    Type *T, SmallVectorImpl<Slice> &Slices) {
+  Slices.clear();
+  auto *VT = dyn_cast<FixedVectorType>(T);
+  if (!VT)
+    return;
+
+  unsigned ElemBitWidth =
+      DL.getTypeSizeInBits(VT->getElementType()).getFixedValue();
+
+  unsigned ElemsPer4Words = 128 / ElemBitWidth;
+  unsigned ElemsPer2Words = ElemsPer4Words / 2;
+  unsigned ElemsPerWord = ElemsPer2Words / 2;
+  unsigned ElemsPerShort = ElemsPerWord / 2;
+  unsigned ElemsPerByte = ElemsPerShort / 2;
+  // If the elements evenly pack into 32-bit words, we can use 3-word stores,
+  // such as for <6 x bfloat> or <3 x i32>, but we can't do this for, for
+  // example, <3 x i64>, since that's not slicing.
+  unsigned ElemsPer3Words = ElemsPerWord * 3;
+
+  unsigned TotalElems = VT->getNumElements();
+  unsigned Off = 0;
+  auto TrySlice = [&](unsigned MaybeLen) {
+    if (MaybeLen > 0 && Off + MaybeLen <= TotalElems) {
+      Slices.emplace_back(/*Offset=*/Off, /*Length=*/MaybeLen);
+      Off += MaybeLen;
+      return true;
+    }
+    return false;
+  };
+  while (Off < TotalElems) {
+    TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
+        TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
+        TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
+  }
+}
+
+Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, Slice S,
+                                                       StringRef Name) {
+  if (S.Length == 1)
+    return IRB.CreateExtractElement(Vec, S.Offset,
+                                    Name + ".slice." + Twine(S.Offset));
+  SmallVector<int> Mask = llvm::to_vector(llvm::iota_range<int>(
+      S.Offset, S.Offset + S.Length, /*Inclusive=*/false));
+  return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Offset));
+}
+
+Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
+                                                      Slice S, StringRef Name) {
+  if (S.Length == 1) {
+    return IRB.CreateInsertElement(Whole, Part, S.Offset,
+                                   Name + ".slice." + Twine(S.Offset));
+  }
+  int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements();
+
+  // Extend the slice with poisons to make the main shufflevector happy.
+  SmallVector<int> ExtPartMask(NumElems, -1);
+  for (auto [I, E] : llvm::enumerate(
+           MutableArrayRef<int>(ExtPartMask).take_front(S.Length))) {
+    E = I;
+  }
+  Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask,
+                                           Name + ".ext." + Twine(S.Offset));
+
+  SmallVector<int> Mask =
+      llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false));
+  for (auto [I, E] :
+       llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Offset, S.Length)))
+    E = I + NumElems;
+  return IRB.CreateShuffleVector(Whole, ExtPart, Mask,
+                                 Name + ".parts." + Twine(S.Offset));
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) {
+  if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  Type *OrigType = LI.getType();
+  Type *ArrayAsVecType = scalarArrayTypeAsVector(OrigType);
+  Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+
+  SmallVector<Slice> Slices;
+  getSlices(LegalType, Slices);
+  bool NeedToSplit = Slices.size() > 1;
+  Value *LoadsRes;
+  StringRef Name = LI.getName();
+  if (!NeedToSplit) {
+    Type *LoadableType = intrinsicTypeFor(LegalType);
+    if (LoadableType == OrigType)
+      return false;
+
+    IRB.SetInsertPoint(&LI);
+    auto *NLI = cast<LoadInst>(LI.clone());
+    NLI->mutateType(LoadableType);
+    NLI = IRB.Insert(NLI);
+    NLI->setName(Name + ".loadable");
+
+    LoadsRes = NLI;
+    if (LoadableType != LegalType) {
+      LoadsRes =
+          IRB.CreateBitCast(LoadsRes, LegalType, Name + ".from.loadable");
+    }
+  } else {
+    IRB.SetInsertPoint(&LI);
+    LoadsRes = PoisonValue::get(LegalType);
+    Value *OrigPtr = LI.getPointerOperand();
+    // If we need to split something into more than one load, its legal type
+    // will be a vector (e.g. an i256 load will have LegalType = <8 x i32>).
+    Type *ElemType = cast<VectorType>(LegalType)->getElementType();
+    unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+    AAMDNodes AANodes = LI.getAAMetadata();
+    for (Slice S : Slices) {
+      Type *SliceType =
+          S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+      unsigned ByteOffset = S.Offset * ElemBytes;
+      // You can't reasonably expect loads to wrap around the edge of memory.
+      Value *NewPtr = IRB.CreateGEP(
+          IRB.getInt8Ty(), LI.getPointerOperand(), IRB.getInt32(ByteOffset),
+          OrigPtr->getName() + ".part.ptr." + Twine(S.Offset),
+          GEPNoWrapFlags::noUnsignedWrap());
+      Type *LoadableType = intrinsicTypeFor(SliceType);
+      LoadInst *NewLI = IRB.CreateAlignedLoad(
+          LoadableType, NewPtr, commonAlignment(LI.getAlign(), ByteOffset),
+          Name + ".part." + Twine(S.Offset));
+      copyMetadataForLoad(*NewLI, LI);
+      NewLI->setAAMetadata(
+          AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
+      if (LI.isAtomic())
+        NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+      if (LI.isVolatile())
+        NewLI->setVolatile(LI.isVolatile());
+      Value *Loaded = NewLI;
+      if (LoadableType != SliceType)
+        Loaded = IRB.CreateBitCast(NewLI, SliceType,
+                                   NewLI->getName() + ".from.loadable");
+      LoadsRes = insertSlice(LoadsRes, Loaded, S, Name);
+    }
+  }
+  if (LegalType != ArrayAsVecType)
+    LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name);
+  if (ArrayAsVecType != OrigType)
+    LoadsRes = vectorToArray(LoadsRes, OrigType, Name);
+  LoadsRes->takeName(&LI);
+  LI.replaceAllUsesWith(LoadsRes);
+  LI.eraseFromParent();
+  return true;
+}
+
+bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
+  if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  IRB.SetInsertPoint(&SI);
+  Value *OrigData = SI.getValueOperand();
+  Type *OrigType = OrigData->getType();
+  StringRef Name = OrigData->getName();
+  Value *NewData = OrigData;
+
+  Type *ArrayAsVecType = scalarArrayTypeAsVector(OrigType);
+  if (ArrayAsVecType != OrigType) {
+    NewData = arrayToVector(NewData, ArrayAsVecType, Name);
+  }
+
+  Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
+  if (LegalType != ArrayAsVecType) {
+    NewData = makeLegalNonAggregate(NewData, LegalType, Name);
+  }
+
+  SmallVector<Slice> Slices;
+  getSlices(LegalType, Slices);
+  bool NeedToSplit = Slices.size() > 1;
+  if (!NeedToSplit) {
+    Type *StorableType = intrinsicTypeFor(LegalType);
+    if (StorableType == OrigType)
+      return false;
+    if (StorableType != LegalType)
+      NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable");
+
+    SI.setOperand(0, NewData);
+    return true;
+  }
+
+  Value *OrigPtr = SI.getPointerOperand();
+  Type *ElemType = cast<VectorType>(LegalType)->getElementType();
+  unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
+  AAMDNodes AANodes = SI.getAAMetadata();
+  for (Slice S : Slices) {
+    Type *SliceType =
+        S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
+    unsigned ByteOffset = S.Offset * ElemBytes;
+    Value *NewPtr =
+        IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
+                      OrigPtr->getName() + ".part." + Twine(S.Offset),
+                      GEPNoWrapFlags::noUnsignedWrap());
+    Value *DataSlice = extractSlice(NewData, S, Name);
+    Type *StorableType = intrinsicTypeFor(SliceType);
+    if (StorableType != SliceType) {
+      DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
+                                    DataSlice->getName() + ".storable");
+    }
+    auto *NewSI = cast<StoreInst>(SI.clone());
+    NewSI->setAlignment(commonAlignment(SI.getAlign(), ByteOffset));
+    IRB.Insert(NewSI);
+    NewSI->setOperand(0, DataSlice);
+    NewSI->setOperand(1, NewPtr);
+    NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL));
+  }
+  SI.eraseFromParent();
+  return true;
+}
+
+bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
+  bool Changed = false;
+  for (Instruction &I : make_early_inc_range(instructions(F))) {
+    Changed |= visit(I);
+  }
+  return Changed;
+}
+
 /// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered
 /// buffer fat pointer constant.
 static std::pair<Constant *, Constant *>
@@ -1256,7 +1703,7 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
   auto [Rsrc, Off] = getPtrParts(Ptr);
   const DataLayout &DL = GEP.getDataLayout();
-  bool InBounds = GEP.isInBounds();
+  bool IsNUW = GEP.hasNoUnsignedWrap();
 
   // In order to call emitGEPOffset() and thus not have to reimplement it,
   // we need the GEP result to have ptr addrspace(7) type.
@@ -1271,16 +1718,12 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     return {Rsrc, Off};
   }
 
-  bool HasNonNegativeOff = false;
-  if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
-    HasNonNegativeOff = !CI->isNegative();
-  }
   Value *NewOff;
   if (match(Off, m_Zero())) {
     NewOff = OffAccum;
   } else {
     NewOff = IRB.CreateAdd(Off, OffAccum, "",
-                           /*hasNUW=*/InBounds && HasNonNegativeOff,
+                           /*hasNUW=*/IsNUW,
                            /*hasNSW=*/false);
   }
   copyMetadata(NewOff, &GEP);
@@ -1781,12 +2224,16 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
   }
 
   StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+  LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
+                                                              M.getContext());
   for (Function &F : M.functions()) {
     bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
     bool BodyChanges = containsBufferFatPointers(F, &StructTM);
     Changed |= MemOpsRewrite.processFunction(F);
-    if (InterfaceChange || BodyChanges)
+    if (InterfaceChange || BodyChanges) {
       NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
+      Changed |= BufferContentsTypeRewrite.processFunction(F);
+    }
   }
   if (NeedsRemap.empty())
     return Changed;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
new file mode 100644
index 00000000000000..b8d01c12b5b180
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -0,0 +1,4871 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
+
+; Note: if you're adding tests here, also add them to
+; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
+; the lowering.
+
+;;; Legal types. These are natively supported, no casts should be performed.
+
+define i8 @load_i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i8, ptr addrspace(7) %p
+  ret i8 %ret
+}
+
+define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i8 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i16 @load_i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i16, ptr addrspace(7) %p
+  ret i16 %ret
+}
+
+define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i16 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i32 @load_i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i32, ptr addrspace(7) %p
+  ret i32 %ret
+}
+
+define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i32 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i64 @load_i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i64, ptr addrspace(7) %p
+  ret i64 %ret
+}
+
+define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i64 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define i128 @load_i128(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i128:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i128:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i128, ptr addrspace(7) %p
+  ret i128 %ret
+}
+
+define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i128:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i128:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i128 %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <1 x i32>, ptr addrspace(7) %p
+  ret <1 x i32> %ret
+}
+
+define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <1 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i32>, ptr addrspace(7) %p
+  ret <2 x i32> %ret
+}
+
+define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x i32>, ptr addrspace(7) %p
+  ret <3 x i32> %ret
+}
+
+define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x i32>, ptr addrspace(7) %p
+  ret <4 x i32> %ret
+}
+
+define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i16>, ptr addrspace(7) %p
+  ret <2 x i16> %ret
+}
+
+define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x i16>, ptr addrspace(7) %p
+  ret <4 x i16> %ret
+}
+
+define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x i16>, ptr addrspace(7) %p
+  ret <8 x i16> %ret
+}
+
+define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i64>, ptr addrspace(7) %p
+  ret <2 x i64> %ret
+}
+
+define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i64> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define half @load_f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load half, ptr addrspace(7) %p
+  ret half %ret
+}
+
+define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store half %data, ptr addrspace(7) %p
+  ret void
+}
+
+define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load bfloat, ptr addrspace(7) %p
+  ret bfloat %ret
+}
+
+define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store bfloat %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x half>, ptr addrspace(7) %p
+  ret <2 x half> %ret
+}
+
+define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x half> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x bfloat>, ptr addrspace(7) %p
+  ret <4 x bfloat> %ret
+}
+
+define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4bf16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4bf16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x bfloat> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x half>, ptr addrspace(7) %p
+  ret <8 x half> %ret
+}
+
+define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x half> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define float @load_f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load float, ptr addrspace(7) %p
+  ret float %ret
+}
+
+define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store float %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x float>, ptr addrspace(7) %p
+  ret <2 x float> %ret
+}
+
+define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x float>, ptr addrspace(7) %p
+  ret <3 x float> %ret
+}
+
+define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x float>, ptr addrspace(7) %p
+  ret <4 x float> %ret
+}
+
+define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(0), ptr addrspace(7) %p
+  ret ptr addrspace(0) %ret
+}
+
+define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(0) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(1), ptr addrspace(7) %p
+  ret ptr addrspace(1) %ret
+}
+
+define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(1) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(2), ptr addrspace(7) %p
+  ret ptr addrspace(2) %ret
+}
+
+define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(2) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(3), ptr addrspace(7) %p
+  ret ptr addrspace(3) %ret
+}
+
+define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(3) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(4), ptr addrspace(7) %p
+  ret ptr addrspace(4) %ret
+}
+
+define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(4) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(5), ptr addrspace(7) %p
+  ret ptr addrspace(5) %ret
+}
+
+define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(5) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(6), ptr addrspace(7) %p
+  ret ptr addrspace(6) %ret
+}
+
+define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(6) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_p8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_p8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load ptr addrspace(8), ptr addrspace(7) %p
+  ret ptr addrspace(8) %ret
+}
+
+define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_p8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_p8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store ptr addrspace(8) %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x ptr addrspace(1)>, ptr addrspace(7) %p
+  ret <2 x ptr addrspace(1)> %ret
+}
+
+define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x ptr addrspace(5)>, ptr addrspace(7) %p
+  ret <2 x ptr addrspace(5)> %ret
+}
+
+define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x ptr addrspace(5)>, ptr addrspace(7) %p
+  ret <3 x ptr addrspace(5)> %ret
+}
+
+define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x ptr addrspace(5)>, ptr addrspace(7) %p
+  ret <4 x ptr addrspace(5)> %ret
+}
+
+define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p5:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p5:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x ptr addrspace(5)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; 3 words in a short type. These need to be bitcast to <3 x i32> to be supported.
+
+define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x half>, ptr addrspace(7) %p
+  ret <6 x half> %ret
+}
+
+define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x half> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Long types (32 bit elements). Must be split into multiple operations.
+
+define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <5 x float>, ptr addrspace(7) %p
+  ret <5 x float> %ret
+}
+
+define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <5 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x float>, ptr addrspace(7) %p
+  ret <6 x float> %ret
+}
+
+define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[4:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <7 x float>, ptr addrspace(7) %p
+  ret <7 x float> %ret
+}
+
+define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[4:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <7 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x float>, ptr addrspace(7) %p
+  ret <8 x float> %ret
+}
+
+define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v10f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v10f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <10 x float>, ptr addrspace(7) %p
+  ret <10 x float> %ret
+}
+
+define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v10f32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v10f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <10 x float> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x i32>, ptr addrspace(7) %p
+  ret <6 x i32> %ret
+}
+
+define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x i32> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x ptr addrspace(1)>, ptr addrspace(7) %p
+  ret <4 x ptr addrspace(1)> %ret
+}
+
+define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x ptr addrspace(1)> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Uneven types with 16-bit elements. Most require splitting into multiple operations (<1 x i16> and <6 x i16> need only one).
+
+define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <1 x i16>, ptr addrspace(7) %p
+  ret <1 x i16> %ret
+}
+
+define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <1 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x i16>, ptr addrspace(7) %p
+  ret <3 x i16> %ret
+}
+
+define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:8
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:8
+; GISEL-NEXT:    s_mov_b32 s4, 0xffff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <5 x i16>, ptr addrspace(7) %p
+  ret <5 x i16> %ret
+}
+
+define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v2, off, s[8:11], 0 offset:8
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:8
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <5 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x i16>, ptr addrspace(7) %p
+  ret <6 x i16> %ret
+}
+
+define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v3, off, s[8:11], 0 offset:12
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 offset:12
+; GISEL-NEXT:    s_mov_b32 s4, 0xffff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT:    v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <7 x i16>, ptr addrspace(7) %p
+  ret <7 x i16> %ret
+}
+
+define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v3, off, s[8:11], 0 offset:12
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v3, off, s[4:7], 0 offset:12
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <7 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v9i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v9i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_mov_b32 s4, 0xffff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GISEL-NEXT:    v_bfi_b32 v1, s4, v1, v1
+; GISEL-NEXT:    v_bfi_b32 v2, s4, v2, v2
+; GISEL-NEXT:    v_bfi_b32 v3, s4, v3, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <9 x i16>, ptr addrspace(7) %p
+  ret <9 x i16> %ret
+}
+
+define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v9i16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v9i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <9 x i16> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Byte vectors. Need to be
+;;; - Split into multiple operations
+;;; - Bitcast if they have a natively supported width
+
+define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v1i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v1i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <1 x i8>, ptr addrspace(7) %p
+  ret <1 x i8> %ret
+}
+
+define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v1i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v1i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <1 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <2 x i8>, ptr addrspace(7) %p
+  ret <2 x i8> %ret
+}
+
+define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <2 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v3i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v3i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <3 x i8>, ptr addrspace(7) %p
+  ret <3 x i8> %ret
+}
+
+define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v3i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v2, off, s[8:11], 0 offset:2
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v3i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:2
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <3 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <4 x i8>, ptr addrspace(7) %p
+  ret <4 x i8> %ret
+}
+
+define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <4 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v5i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v5i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <5 x i8>, ptr addrspace(7) %p
+  ret <5 x i8> %ret
+}
+
+define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v5i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v5i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v6, 8
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v5, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <5 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v7
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <6 x i8>, ptr addrspace(7) %p
+  ret <6 x i8> %ret
+}
+
+define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <6 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v7i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_load_ubyte v6, off, s[8:11], 0 offset:6
+; SDAG-NEXT:    s_waitcnt vmcnt(2)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v7i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:6
+; GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <7 x i8>, ptr addrspace(7) %p
+  ret <7 x i8> %ret
+}
+
+define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v7i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    v_lshlrev_b16_e32 v0, 8, v5
+; SDAG-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_store_byte v6, off, s[8:11], 0 offset:6
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v7i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v8, 8
+; GISEL-NEXT:    v_mov_b32_e32 v7, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v7, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GISEL-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_store_byte v6, off, s[4:7], 0 offset:6
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <7 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mov_b32_e32 v1, v8
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <8 x i8>, ptr addrspace(7) %p
+  ret <8 x i8> %ret
+}
+
+define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx2 v[3:4], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v9, 8
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v8, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <8 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v12i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mov_b32_e32 v1, v14
+; SDAG-NEXT:    v_mov_b32_e32 v2, v13
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v12i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v12
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <12 x i8>, ptr addrspace(7) %p
+  ret <12 x i8> %ret
+}
+
+define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v12i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx3 v[6:8], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v12i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v13, 8
+; GISEL-NEXT:    v_mov_b32_e32 v12, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v12, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT:    v_and_or_b32 v2, v8, v12, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <12 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v16i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b64 v[18:19], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[2:3]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v17, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; SDAG-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v12, v3
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v16
+; SDAG-NEXT:    v_mov_b32_e32 v3, v18
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v16i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GISEL-NEXT:    v_mov_b32_e32 v12, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v16
+; GISEL-NEXT:    v_mov_b32_e32 v2, v17
+; GISEL-NEXT:    v_mov_b32_e32 v3, v18
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <16 x i8>, ptr addrspace(7) %p
+  ret <16 x i8> %ret
+}
+
+define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v16i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx4 v[9:12], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v16i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v17, 8
+; GISEL-NEXT:    v_mov_b32_e32 v16, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v16, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v6
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v7
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT:    v_and_or_b32 v2, v8, v16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v3, v17, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT:    v_and_or_b32 v3, v12, v16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <16 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+; Loads a <32 x i8> (32 bytes) through a pointer obtained by casting %buf from
+; addrspace(8) to addrspace(7). Both the SDAG and GISEL checks expect the
+; access to be split into two buffer_load_dwordx4 instructions (offsets 0 and
+; 16), followed by shifts and moves that spread the loaded dwords over v0-v31,
+; one result element per register.
+define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[33:36], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[48:51], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[33:34]
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[35:36]
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b64 v[19:20], 24, v[48:49]
+; SDAG-NEXT:    v_lshrrev_b64 v[27:28], 24, v[50:51]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v33
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v33
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v34
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v34
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 24, v34
+; SDAG-NEXT:    v_lshrrev_b32_e32 v9, 8, v35
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 16, v35
+; SDAG-NEXT:    v_lshrrev_b32_e32 v13, 8, v36
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 16, v36
+; SDAG-NEXT:    v_lshrrev_b32_e32 v15, 24, v36
+; SDAG-NEXT:    v_lshrrev_b32_e32 v17, 8, v48
+; SDAG-NEXT:    v_lshrrev_b32_e32 v18, 16, v48
+; SDAG-NEXT:    v_lshrrev_b32_e32 v21, 8, v49
+; SDAG-NEXT:    v_lshrrev_b32_e32 v22, 16, v49
+; SDAG-NEXT:    v_lshrrev_b32_e32 v23, 24, v49
+; SDAG-NEXT:    v_lshrrev_b32_e32 v25, 8, v50
+; SDAG-NEXT:    v_lshrrev_b32_e32 v26, 16, v50
+; SDAG-NEXT:    v_lshrrev_b32_e32 v29, 8, v51
+; SDAG-NEXT:    v_lshrrev_b32_e32 v30, 16, v51
+; SDAG-NEXT:    v_lshrrev_b32_e32 v31, 24, v51
+; SDAG-NEXT:    v_mov_b32_e32 v0, v33
+; SDAG-NEXT:    v_mov_b32_e32 v4, v34
+; SDAG-NEXT:    v_mov_b32_e32 v8, v35
+; SDAG-NEXT:    v_mov_b32_e32 v12, v36
+; SDAG-NEXT:    v_mov_b32_e32 v16, v48
+; SDAG-NEXT:    v_mov_b32_e32 v20, v49
+; SDAG-NEXT:    v_mov_b32_e32 v24, v50
+; SDAG-NEXT:    v_mov_b32_e32 v28, v51
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
+; GISEL-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; GISEL-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
+; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
+; GISEL-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
+; GISEL-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
+; GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GISEL-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
+; GISEL-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
+; GISEL-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
+; GISEL-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
+; GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v8, v2
+; GISEL-NEXT:    v_mov_b32_e32 v12, v3
+; GISEL-NEXT:    v_mov_b32_e32 v20, v17
+; GISEL-NEXT:    v_mov_b32_e32 v24, v18
+; GISEL-NEXT:    v_mov_b32_e32 v28, v19
+; GISEL-NEXT:    v_mov_b32_e32 v1, v35
+; GISEL-NEXT:    v_mov_b32_e32 v2, v36
+; GISEL-NEXT:    v_mov_b32_e32 v3, v37
+; GISEL-NEXT:    v_mov_b32_e32 v17, v32
+; GISEL-NEXT:    v_mov_b32_e32 v18, v33
+; GISEL-NEXT:    v_mov_b32_e32 v19, v34
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; View the addrspace(8) argument as an addrspace(7) pointer so the plain
+  ; load below goes through the addrspace(7) lowering under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <32 x i8>, ptr addrspace(7) %p
+  ret <32 x i8> %ret
+}
+
+; Stores a <32 x i8> (32 bytes) through a pointer obtained by casting %buf from
+; addrspace(8) to addrspace(7). Both the SDAG and GISEL checks expect the byte
+; elements to be packed into dwords and written with two buffer_store_dwordx4
+; instructions (offsets 0 and 16).
+; One byte is fetched with buffer_load_ubyte from s[0:3] at s32 before being
+; packed - presumably the last vector element, passed on the stack because the
+; argument registers are exhausted (TODO confirm against the calling
+; convention).
+define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
+; SDAG-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v13, 8, v15
+; SDAG-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
+; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v29
+; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v9, 8, v11
+; SDAG-NEXT:    v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v25
+; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v27
+; SDAG-NEXT:    v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v21
+; SDAG-NEXT:    v_lshlrev_b16_e32 v2, 8, v23
+; SDAG-NEXT:    v_lshlrev_b16_e32 v3, 8, v17
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_lshlrev_b16_e32 v15, 8, v19
+; SDAG-NEXT:    v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v4, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v0, 8, v14
+; SDAG-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dwordx4 v[3:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v31, 8
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_mov_b32_e32 v32, 0xff
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v32, v1
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v31, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GISEL-NEXT:    buffer_load_ubyte v7, off, s[0:3], s32
+; GISEL-NEXT:    v_and_or_b32 v1, v4, v32, v1
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT:    v_or3_b32 v1, v1, v4, v5
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v2, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v10
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v11
+; GISEL-NEXT:    v_and_or_b32 v2, v8, v32, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v3, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v15
+; GISEL-NEXT:    v_and_or_b32 v3, v12, v32, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; GISEL-NEXT:    v_or3_b32 v3, v3, v4, v5
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v4, v31, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v18
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v19
+; GISEL-NEXT:    v_and_or_b32 v4, v16, v32, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v8, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xff, v22
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xff, v23
+; GISEL-NEXT:    v_and_or_b32 v8, v20, v32, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; GISEL-NEXT:    v_or3_b32 v5, v8, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v6, v31, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v8, 0xff, v26
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v27
+; GISEL-NEXT:    v_and_or_b32 v6, v24, v32, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; GISEL-NEXT:    v_or3_b32 v6, v6, v8, v9
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_and_or_b32 v8, v28, v32, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GISEL-NEXT:    v_or3_b32 v7, v8, v9, v7
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; View the addrspace(8) argument as an addrspace(7) pointer so the plain
+  ; store below goes through the addrspace(7) lowering under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <32 x i8> %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Arrays. Need to become vectors.
+
+; Loads a one-element array [1 x i32] through the addrspace(7) pointer derived
+; from %buf. Both the SDAG and GISEL checks expect a single buffer_load_dword
+; into v0, i.e. the array wrapper adds no extra instructions.
+define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [1 x i32], ptr addrspace(7) %p
+  ret [1 x i32] %ret
+}
+
+; Stores a one-element array [1 x i32] through the addrspace(7) pointer derived
+; from %buf. Both the SDAG and GISEL checks expect a single buffer_store_dword
+; of v0.
+define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a1i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a1i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [1 x i32] %data, ptr addrspace(7) %p
+  ret void
+}
+
+; Loads a two-element array [2 x i32] through the addrspace(7) pointer derived
+; from %buf. Both the SDAG and GISEL checks expect one buffer_load_dwordx2 into
+; v[0:1] rather than two separate element loads.
+define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x i32], ptr addrspace(7) %p
+  ret [2 x i32] %ret
+}
+
+; Stores a two-element array [2 x i32] through the addrspace(7) pointer derived
+; from %buf. Both the SDAG and GISEL checks expect one buffer_store_dwordx2 of
+; v[0:1].
+define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x i32] %data, ptr addrspace(7) %p
+  ret void
+}
+
+; Loads a [2 x half] (32 bits total) through the addrspace(7) pointer derived
+; from %buf. Both the SDAG and GISEL checks expect one buffer_load_dword into
+; v0 followed by a 16-bit right shift into v1, so the two half elements come
+; back in v0 and v1 from a single dword load.
+define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x half], ptr addrspace(7) %p
+  ret [2 x half] %ret
+}
+
+; Stores a [2 x half] (32 bits total) through the addrspace(7) pointer derived
+; from %buf. Both selectors are expected to merge v0 and v1 into one dword and
+; emit a single buffer_store_dword; they differ only in how the merge is done
+; (v_perm_b32 with selector 0x5040100 in the SDAG checks, v_and_b32 plus
+; v_lshl_or_b32 in the GISEL checks).
+define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0x5040100
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, s4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x half] %data, ptr addrspace(7) %p
+  ret void
+}
+
+; Loads an array of two addrspace(1) pointers through the addrspace(7) pointer
+; derived from %buf. Both the SDAG and GISEL checks expect one
+; buffer_load_dwordx4 into v[0:3], covering both pointers in a single access.
+define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x ptr addrspace(1)], ptr addrspace(7) %p
+  ret [2 x ptr addrspace(1)] %ret
+}
+
+; Stores an array of two addrspace(1) pointers through the addrspace(7) pointer
+; derived from %buf. Both the SDAG and GISEL checks expect one
+; buffer_store_dwordx4 of v[0:3].
+define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2p1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2p1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the aggregate store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x ptr addrspace(1)] %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Scalars of atypical width. Need to be cast to vectors and split.
+
+; Loads an i40 (5 bytes) through the addrspace(7) pointer derived from %buf.
+; Both selectors are expected to split the access into a buffer_load_dword at
+; offset 0 plus a buffer_load_ubyte at offset 4.
+; The GISEL checks additionally contain a sequence of shifts, masks and ors
+; that takes v0 apart into bytes and reassembles it into v0; the SDAG checks
+; have no such sequence.
+; NOTE(review): that GISEL sequence looks like an un-folded byte-vector
+; round-trip - confirm whether it is expected or a missed optimization.
+define i40 @load_i40(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i40:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i40:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GISEL-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
+; GISEL-NEXT:    v_or_b32_e32 v2, v2, v4
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the odd-width load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i40, ptr addrspace(7) %p
+  ret i40 %ret
+}
+
+; Stores an i40 (5 bytes) through the addrspace(7) pointer derived from %buf.
+; Both the SDAG and GISEL checks expect the access to be split into a
+; buffer_store_dword of v0 at offset 0 plus a buffer_store_byte of v1 at
+; offset 4.
+define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i40:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i40:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the odd-width store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i40 %data, ptr addrspace(7) %p
+  ret void
+}
+
+; Loads an i96 (three dwords) through the addrspace(7) pointer derived from
+; %buf. Both the SDAG and GISEL checks expect one buffer_load_dwordx3 into
+; v[0:2], with no splitting.
+define i96 @load_i96(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i96:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i96:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the wide scalar load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i96, ptr addrspace(7) %p
+  ret i96 %ret
+}
+
+; Stores an i96 (three dwords) through the addrspace(7) pointer derived from
+; %buf. Both the SDAG and GISEL checks expect one buffer_store_dwordx3 of
+; v[0:2], with no splitting.
+define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i96:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i96:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the wide scalar store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i96 %data, ptr addrspace(7) %p
+  ret void
+}
+
+; Loads an i160 (five dwords) through the addrspace(7) pointer derived from
+; %buf. Both selectors are expected to split the access into a
+; buffer_load_dwordx4 at offset 0 plus a buffer_load_dword at offset 16.
+; NOTE(review): only the SDAG checks also save s33, recompute it from s32 with
+; an add/and pair, adjust s32 up and back down, and restore s33, although no
+; scratch access appears in the expected output. This looks like leftover
+; frame setup - confirm whether it is intended codegen.
+define i160 @load_i160(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i160:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_mov_b32 s4, s33
+; SDAG-NEXT:    s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT:    s_addk_i32 s32, 0x1800
+; SDAG-NEXT:    s_addk_i32 s32, 0xe800
+; SDAG-NEXT:    s_mov_b32 s33, s4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i160:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the wide scalar load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i160, ptr addrspace(7) %p
+  ret i160 %ret
+}
+
+; Stores an i160 (five dwords) through the addrspace(7) pointer derived from
+; %buf. Both selectors are expected to split the access into a
+; buffer_store_dwordx4 of v[0:3] at offset 0 plus a buffer_store_dword of v4 at
+; offset 16.
+; NOTE(review): only the SDAG checks also save s33, recompute it from s32 with
+; an add/and pair, adjust s32 up and back down, and restore s33, although no
+; scratch access appears in the expected output. This looks like leftover
+; frame setup - confirm whether it is intended codegen.
+define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i160:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, s33
+; SDAG-NEXT:    s_add_i32 s33, s32, 0x7c0
+; SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff800
+; SDAG-NEXT:    s_addk_i32 s32, 0x1000
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_addk_i32 s32, 0xf000
+; SDAG-NEXT:    s_mov_b32 s33, s4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i160:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the wide scalar store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i160 %data, ptr addrspace(7) %p
+  ret void
+}
+
+; Loads an i256 (eight dwords) through the addrspace(7) pointer derived from
+; %buf. Both the SDAG and GISEL checks expect the access to be split into two
+; buffer_load_dwordx4 instructions, at offsets 0 and 16, filling v[0:7].
+define i256 @load_i256(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i256:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i256:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the wide scalar load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i256, ptr addrspace(7) %p
+  ret i256 %ret
+}
+
+; Stores an i256 (eight dwords) through the addrspace(7) pointer derived from
+; %buf. Both the SDAG and GISEL checks expect the access to be split into two
+; buffer_store_dwordx4 instructions, v[0:3] at offset 0 and v[4:7] at
+; offset 16.
+define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i256:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i256:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the wide scalar store is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i256 %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Non-byte-sized scalars. Require zero-extension.
+
+; Loads a non-byte-sized scalar through the addrspace(7) pointer derived from
+; %buf. Both the SDAG and GISEL checks expect a single buffer_load_ubyte into
+; v0, i.e. the value is read as one whole byte.
+; NOTE(review): the function is named load_i4 but the type it loads and
+; returns is i7; name and type disagree - confirm which was intended. Any
+; rename or retype requires regenerating the check lines.
+define i7 @load_i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ; Cast to addrspace(7) so the sub-byte load is handled by the lowering
+  ; under test.
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i7, ptr addrspace(7) %p
+  ret i7 %ret
+}
+
+define void @store_i4(i7 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store i7 %data, ptr addrspace(7) %p ; NOTE(review): function is named store_i4 but stores i7 -- confirm; checks expect a 0x7f mask, then one byte store
+  ret void
+}
+
+;;; Byte-sized vectors of i4. Require casts.
+
+define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT:    v_lshrrev_b16_e32 v1, 4, v1
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  %ret = load <2 x i4>, ptr addrspace(7) %p ; 8 bits total: checks expect one ubyte load; SDAG also shows a store/reload via s[0:3] at s32 (presumably a stack slot -- TODO confirm this is intended)
+  ret <2 x i4> %ret
+}
+
+define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store <2 x i4> %data, ptr addrspace(7) %p ; checks expect both i4 lanes packed into one byte (lane 1 shifted left by 4) and a single byte store to the buffer
+  ret void
+}
+
+define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v4i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[0:3], s32
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b16_e32 v4, 4, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v1
+; SDAG-NEXT:    v_lshrrev_b16_e32 v3, 12, v1
+; SDAG-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v4i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  %ret = load <4 x i4>, ptr addrspace(7) %p ; 16 bits total: checks expect one ushort load from the buffer, then per-lane unpacking into v0..v3
+  ret <4 x i4> %ret
+}
+
+define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v4i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    v_mov_b32_e32 v1, 15
+; SDAG-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 12, v3
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v4i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, 15
+; GISEL-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v3
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store <4 x i4> %data, ptr addrspace(7) %p ; checks expect the four i4 lanes packed into 16 bits and written with a single short store
+  ret void
+}
+
+define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v8i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v7, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v7
+; SDAG-NEXT:    v_bfe_u32 v1, v7, 4, 4
+; SDAG-NEXT:    v_bfe_u32 v2, v7, 8, 4
+; SDAG-NEXT:    v_bfe_u32 v3, v7, 12, 4
+; SDAG-NEXT:    v_bfe_u32 v4, v7, 16, 4
+; SDAG-NEXT:    v_bfe_u32 v5, v7, 20, 4
+; SDAG-NEXT:    v_bfe_u32 v6, v7, 24, 4
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 28, v7
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v8i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 20, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 28, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  %ret = load <8 x i4>, ptr addrspace(7) %p ; 32 bits total: checks expect one dword load, with lanes extracted at bit offsets 0, 4, ..., 28
+  ret <8 x i4> %ret
+}
+
+define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v8i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v1
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; SDAG-NEXT:    v_and_or_b32 v0, v0, 15, v1
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v3
+; SDAG-NEXT:    v_and_b32_e32 v2, 15, v2
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT:    v_and_b32_e32 v1, 15, v5
+; SDAG-NEXT:    v_mov_b32_e32 v2, 15
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 20, v1
+; SDAG-NEXT:    v_and_b32_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT:    v_or3_b32 v0, v0, v3, v1
+; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 28, v7
+; SDAG-NEXT:    v_and_b32_sdwa v2, v6, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v8i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GISEL-NEXT:    v_and_or_b32 v0, v0, 15, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 15, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 15, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 12, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    v_mov_b32_e32 v1, 15
+; GISEL-NEXT:    v_and_b32_e32 v3, 15, v5
+; GISEL-NEXT:    v_and_b32_sdwa v2, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
+; GISEL-NEXT:    v_or3_b32 v0, v0, v2, v3
+; GISEL-NEXT:    v_and_b32_e32 v2, 15, v7
+; GISEL-NEXT:    v_and_b32_sdwa v1, v6, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 28, v2
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store <8 x i4> %data, ptr addrspace(7) %p ; checks expect the eight i4 lanes packed into one dword and written with a single dword store
+  ret void
+}
+
+;;; Vectors of non-byte-sized integers.
+
+define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v2i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v0, 63, v1
+; SDAG-NEXT:    v_bfe_u32 v1, v1, 6, 6
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v2i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b16_e32 v1, 6, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  %ret = load <2 x i6>, ptr addrspace(7) %p ; 12 bits total: checks expect one ushort load, with lane 1 taken from bit 6 upward
+  ret <2 x i6> %ret
+}
+
+define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v2i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 6, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 63, v0
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v2i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 63, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 63, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 6, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store <2 x i6> %data, ptr addrspace(7) %p ; checks expect the two i6 lanes packed into 12 bits (masked with 0xfff) and written with a single short store
+  ret void
+}
+
+;; Blocks of fp6 elements: <32 x i6> is 192 bits, bitcast to/from <6 x i32> at the function boundary.
+define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_v32i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_v32i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  %ret = load <32 x i6>, ptr addrspace(7) %p ; 192 bits total: checks expect a dwordx4 load plus a dwordx2 load at offset 16
+  %ret.cast = bitcast <32 x i6> %ret to <6 x i32> ; returned as <6 x i32>, the same 192 bits
+  ret <6 x i32> %ret.cast
+}
+
+define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_v32i6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_v32i6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %data = bitcast <6 x i32> %data.abi to <32 x i6> ; reinterpret the <6 x i32> argument as <32 x i6> (192 bits either way)
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store <32 x i6> %data, ptr addrspace(7) %p ; checks expect a dwordx4 store plus a dwordx2 store at offset 16
+  ret void
+}
+
+;;; Modifiers
+
+define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  %ret = load volatile <4 x i8>, ptr addrspace(7) %p ; volatile load: checks expect a single dword load carrying the glc modifier, then byte extraction by shifts
+  ret <4 x i8> %ret
+}
+
+define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v4i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v4i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xff
+; GISEL-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v1
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store volatile <4 x i8> %data, ptr addrspace(7) %p ; volatile store: checks expect the four bytes packed into one dword store (no glc appears on the store in the checks)
+  ret void
+}
+
+define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_load_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; SDAG-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:4 glc
+; SDAG-NEXT:    s_waitcnt vmcnt(1)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v7
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_load_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:4 glc
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  %ret = load volatile <6 x i8>, ptr addrspace(7) %p ; volatile 6-byte load: checks expect a dword load plus a ushort load at offset 4, both carrying glc
+  ret <6 x i8> %ret
+}
+
+define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: volatile_store_v6i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
+; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: volatile_store_v6i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v3
+; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GISEL-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v5
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) ; addrspace(7) (buffer fat pointer) view of %buf
+  store volatile <6 x i8> %data, ptr addrspace(7) %p ; volatile 6-byte store: checks expect a dword store plus a short store at offset 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 6f0d51a0277380..7e768982ba4286 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -91,7 +91,12 @@ define void @caller(ptr addrspace(7) noundef nonnull %arg) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i160 [[V_INT_RSRC]], 32
 ; CHECK-NEXT:    [[V_INT_OFF:%.*]] = zext i32 [[V_OFF]] to i160
 ; CHECK-NEXT:    [[V_INT:%.*]] = or i160 [[TMP1]], [[V_INT_OFF]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[V_INT]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT:    [[V_INT_CAST:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
+; CHECK-NEXT:    [[V_INT_CAST_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_CAST]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_CAST_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT:    [[ARG_PART_4:%.*]] = add nuw i32 [[ARG_OFF]], 16
+; CHECK-NEXT:    [[V_INT_CAST_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_CAST]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_CAST_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %v = call ptr addrspace(7) @extern(ptr addrspace(7) %arg)
@@ -104,7 +109,7 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n
 ; CHECK-SAME: ({ ptr addrspace(8), i32 } noundef [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0
 ; CHECK-NEXT:    [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1
-; CHECK-NEXT:    [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[ARG_OFF]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1
 ; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[TMP2]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index 5b225636b120a4..c821d0abfc1f5e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -1,6 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
 
+; Note: if you're adding tests here, also add them to
+; buffer-fat-pointers-contents-legalization.ll to make sure the output of this
+; transformation can codegen.
+
 target triple = "amdgcn--"
 
 ;;; Legal types. These are natively supported, no casts should be performed.
@@ -118,7 +122,8 @@ define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
 define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <1 x i32> @load_v1i32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <1 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
 ; CHECK-NEXT:    ret <1 x i32> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -129,7 +134,8 @@ define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
 define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v1i32(
 ; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v1i32(<1 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -758,7 +764,8 @@ define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
 define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <6 x half> @load_v6f16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x half>
 ; CHECK-NEXT:    ret <6 x half> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -769,7 +776,8 @@ define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
 define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v6f16(
 ; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6f16(<6 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <6 x half> [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -782,7 +790,11 @@ define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
 define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <5 x float> @load_v5f32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <5 x float> @llvm.amdgcn.raw.ptr.buffer.load.v5f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x float> poison, <5 x float> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x float> [[RET_PARTS_0]], float [[RET_PART_4]], i64 4
 ; CHECK-NEXT:    ret <5 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -793,7 +805,10 @@ define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
 define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v5f32(
 ; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v5f32(<5 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x float> [[DATA]], <5 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x float> [[DATA]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -804,7 +819,12 @@ define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
 define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <6 x float> @load_v6f32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x float> @llvm.amdgcn.raw.ptr.buffer.load.v6f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x float> poison, <6 x float> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x float> [[RET_PART_4]], <2 x float> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <6 x float> [[RET_PARTS_0]], <6 x float> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:    ret <6 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -815,7 +835,10 @@ define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
 define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v6f32(
 ; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6f32(<6 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -826,7 +849,12 @@ define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
 define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <7 x float> @load_v7f32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <7 x float> @llvm.amdgcn.raw.ptr.buffer.load.v7f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x float> poison, <7 x float> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <3 x float> [[RET_PART_4]], <3 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <7 x float> [[RET_PARTS_0]], <7 x float> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 9>
 ; CHECK-NEXT:    ret <7 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -837,7 +865,10 @@ define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
 define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v7f32(
 ; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v7f32(<7 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -848,7 +879,12 @@ define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
 define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <8 x float> @load_v8f32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x float> @llvm.amdgcn.raw.ptr.buffer.load.v8f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x float> poison, <8 x float> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_PART_4]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <8 x float> [[RET_PARTS_0]], <8 x float> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -859,7 +895,10 @@ define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
 define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v8f32(
 ; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8f32(<8 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -870,7 +909,15 @@ define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
 define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <10 x float> @load_v10f32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <10 x float> @llvm.amdgcn.raw.ptr.buffer.load.v10f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <10 x float> poison, <10 x float> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_PART_4]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <10 x float> [[RET_PARTS_0]], <10 x float> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
+; CHECK-NEXT:    [[RET_PART_8:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_8:%.*]] = shufflevector <2 x float> [[RET_PART_8]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <10 x float> [[RET_PARTS_4]], <10 x float> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
 ; CHECK-NEXT:    ret <10 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -881,7 +928,12 @@ define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
 define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v10f32(
 ; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v10f32(<10 x float> [[DATA]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_8:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <2 x i32> <i32 8, i32 9>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA_SLICE_8]], ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -892,7 +944,12 @@ define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
 define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <6 x i32> @load_v6i32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v6i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_PART_4]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:    ret <6 x i32> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -903,7 +960,10 @@ define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
 define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v6i32(
 ; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i32(<6 x i32> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -914,7 +974,12 @@ define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
 define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <4 x ptr addrspace(1)> @load_v4p1(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v4p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_PART_0]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <4 x ptr addrspace(1)> poison, <4 x ptr addrspace(1)> [[RET_EXT_0]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[RET_PART_2:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_2:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_PART_2]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET:%.*]] = shufflevector <4 x ptr addrspace(1)> [[RET_PARTS_0]], <4 x ptr addrspace(1)> [[RET_EXT_2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x ptr addrspace(1)> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -925,7 +990,10 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
 define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v4p1(
 ; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4p1(<4 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_2]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -938,7 +1006,8 @@ define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
 define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <1 x i16> @load_v1i16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <1 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v1i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <1 x i16>
 ; CHECK-NEXT:    ret <1 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -949,7 +1018,8 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
 define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v1i16(
 ; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v1i16(<1 x i16> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i16> [[DATA]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_STORABLE]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -960,7 +1030,11 @@ define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
 define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <3 x i16> @load_v3i16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <3 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v3i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
 ; CHECK-NEXT:    ret <3 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -971,7 +1045,10 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
 define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v3i16(
 ; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i16(<3 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -982,7 +1059,11 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
 define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <5 x i16> @load_v5i16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <5 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v5i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i16> [[RET_PART_0]], <4 x i16> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i16> poison, <5 x i16> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i16> [[RET_PARTS_0]], i16 [[RET_PART_4]], i64 4
 ; CHECK-NEXT:    ret <5 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -993,7 +1074,10 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
 define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v5i16(
 ; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v5i16(<5 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i16> [[DATA]], <5 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i16> [[DATA]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1004,7 +1088,8 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
 define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <6 x i16> @load_v6i16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x i16>
 ; CHECK-NEXT:    ret <6 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1015,7 +1100,8 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
 define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v6i16(
 ; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <6 x i16> [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1026,7 +1112,12 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
 define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <7 x i16> @load_v7i16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <7 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v7i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast <3 x i32> [[RET_PART_0]] to <6 x i16>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <6 x i16> [[RET_PART_0_FROM_LOADABLE]], <6 x i16> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x i16> poison, <7 x i16> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6>
+; CHECK-NEXT:    [[RET_PART_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i16> [[RET_PARTS_0]], i16 [[RET_PART_6]], i64 6
 ; CHECK-NEXT:    ret <7 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1037,7 +1128,11 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
 define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v7i16(
 ; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v7i16(<7 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x i16> [[DATA]], <7 x i16> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <6 x i16> [[DATA_SLICE_0]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_6:%.*]] = extractelement <7 x i16> [[DATA]], i64 6
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_6]], ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1048,7 +1143,11 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
 define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <9 x i16> @load_v9i16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <9 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v9i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <8 x i16> [[RET_PART_0]], <8 x i16> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <9 x i16> poison, <9 x i16> [[RET_EXT_0]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[RET_PART_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <9 x i16> [[RET_PARTS_0]], i16 [[RET_PART_8]], i64 8
 ; CHECK-NEXT:    ret <9 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1059,7 +1158,10 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
 define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v9i16(
 ; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v9i16(<9 x i16> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <9 x i16> [[DATA]], <9 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_8:%.*]] = extractelement <9 x i16> [[DATA]], i64 8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_8]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1074,7 +1176,8 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
 define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <1 x i8> @load_v1i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <1 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v1i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <1 x i8>
 ; CHECK-NEXT:    ret <1 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1085,7 +1188,8 @@ define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
 define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v1i8(
 ; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v1i8(<1 x i8> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <1 x i8> [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1096,7 +1200,8 @@ define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
 define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <2 x i8> @load_v2i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <2 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v2i8(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
 ; CHECK-NEXT:    ret <2 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1107,7 +1212,8 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
 define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v2i8(
 ; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i8(<2 x i8> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1118,7 +1224,12 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
 define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <3 x i8> @load_v3i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <3 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v3i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_PART_0]] to <2 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i8> [[RET_PART_0_FROM_LOADABLE]], <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i8> poison, <3 x i8> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i8> [[RET_PARTS_0]], i8 [[RET_PART_2]], i64 2
 ; CHECK-NEXT:    ret <3 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1129,7 +1240,11 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
 define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v3i8(
 ; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i8(<3 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i8> [[DATA]], <3 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_0]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i8> [[DATA]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_2]], ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1140,7 +1255,8 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
 define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <4 x i8> @load_v4i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1151,7 +1267,8 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
 define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v4i8(
 ; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1162,7 +1279,12 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
 define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <5 x i8> @load_v5i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <5 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v5i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_PART_4]], i64 4
 ; CHECK-NEXT:    ret <5 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1173,7 +1295,11 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
 define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v5i8(
 ; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v5i8(<5 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1184,7 +1310,12 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
 define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <6 x i8> @load_v6i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
 ; CHECK-NEXT:    ret <6 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1195,7 +1326,11 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
 define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v6i8(
 ; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1206,7 +1341,16 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
 define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <7 x i8> @load_v7i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <7 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v7i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x i8> poison, <7 x i8> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_4_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_PART_4]] to <2 x i8>
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i8> [[RET_PART_4_FROM_LOADABLE]], <2 x i8> poison, <7 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <7 x i8> [[RET_PARTS_0]], <7 x i8> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 6>
+; CHECK-NEXT:    [[RET_PART_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i8> [[RET_PARTS_4]], i8 [[RET_PART_6]], i64 6
 ; CHECK-NEXT:    ret <7 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1217,7 +1361,14 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
 define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v7i8(
 ; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v7i8(<7 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[DATA_SLICE_4_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_4]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_4_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_6:%.*]] = extractelement <7 x i8> [[DATA]], i64 6
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_6]], ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1228,7 +1379,8 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
 define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <8 x i8> @load_v8i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v8i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <8 x i8>
 ; CHECK-NEXT:    ret <8 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1239,7 +1391,8 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
 define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v8i8(
 ; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i8(<8 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <8 x i8> [[DATA]] to <2 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1250,7 +1403,8 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
 define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <12 x i8> @load_v12i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <12 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v12i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <12 x i8>
 ; CHECK-NEXT:    ret <12 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1261,7 +1415,8 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
 define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v12i8(
 ; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v12i8(<12 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <12 x i8> [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1272,7 +1427,8 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
 define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <16 x i8> @load_v16i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <16 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v16i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <4 x i32> [[RET_LOADABLE]] to <16 x i8>
 ; CHECK-NEXT:    ret <16 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1283,7 +1439,8 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
 define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v16i8(
 ; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v16i8(<16 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <16 x i8> [[DATA]] to <4 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1294,7 +1451,13 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
 define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <32 x i8> @load_v32i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <32 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v32i8(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_PART_4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to <32 x i8>
 ; CHECK-NEXT:    ret <32 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1305,7 +1468,11 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
 define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v32i8(
 ; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v32i8(<32 x i8> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i8> [[DATA]] to <8 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1318,7 +1485,10 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
 define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define [1 x i32] @load_a1i32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [1 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <1 x i32> [[RET_FROM_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [1 x i32] poison, i32 [[RET_ELEM_0]], 0
 ; CHECK-NEXT:    ret [1 x i32] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1329,7 +1499,10 @@ define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
 define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_a1i32(
 ; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a1i32([1 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [1 x i32] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <1 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA_AS_VEC_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1340,7 +1513,11 @@ define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
 define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define [2 x i32] @load_a2i32(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [2 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET_ELEM_0]], 0
+; CHECK-NEXT:    [[RET_ELEM_1:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x i32] [[RET_AS_ARRAY_0]], i32 [[RET_ELEM_1]], 1
 ; CHECK-NEXT:    ret [2 x i32] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1351,7 +1528,11 @@ define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
 define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_a2i32(
 ; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a2i32([2 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA]], 1
+; CHECK-NEXT:    [[DATA_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA_AS_VEC_0]], i32 [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1362,7 +1543,11 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
 define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define [2 x half] @load_a2f16(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [2 x half] @llvm.amdgcn.raw.ptr.buffer.load.a2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0
+; CHECK-NEXT:    [[RET_ELEM_1:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x half] [[RET_AS_ARRAY_0]], half [[RET_ELEM_1]], 1
 ; CHECK-NEXT:    ret [2 x half] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1373,7 +1558,11 @@ define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
 define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_a2f16(
 ; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a2f16([2 x half] [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x half] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1
+; CHECK-NEXT:    [[DATA_AS_VEC_1:%.*]] = insertelement <2 x half> [[DATA_AS_VEC_0]], half [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA_AS_VEC_1]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1384,7 +1573,11 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
 define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define [2 x ptr addrspace(1)] @load_a2p1(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call [2 x ptr addrspace(1)] @llvm.amdgcn.raw.ptr.buffer.load.a2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 0
+; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x ptr addrspace(1)] poison, ptr addrspace(1) [[RET_ELEM_0]], 0
+; CHECK-NEXT:    [[RET_ELEM_1:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x ptr addrspace(1)] [[RET_AS_ARRAY_0]], ptr addrspace(1) [[RET_ELEM_1]], 1
 ; CHECK-NEXT:    ret [2 x ptr addrspace(1)] [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1395,7 +1588,11 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
 define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_a2p1(
 ; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.a2p1([2 x ptr addrspace(1)] [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 0
+; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[DATA_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 1
+; CHECK-NEXT:    [[DATA_AS_VEC_1:%.*]] = insertelement <2 x ptr addrspace(1)> [[DATA_AS_VEC_0]], ptr addrspace(1) [[DATA_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_AS_VEC_1]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1408,7 +1605,13 @@ define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
 define i40 @load_i40(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define i40 @load_i40(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i40 @llvm.amdgcn.raw.ptr.buffer.load.i40(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_PART_4]], i64 4
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <5 x i8> [[RET_SLICE_4]] to i40
 ; CHECK-NEXT:    ret i40 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1419,7 +1622,12 @@ define i40 @load_i40(ptr addrspace(8) %buf) {
 define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_i40(
 ; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i40(i40 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i40 [[DATA]] to <5 x i8>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA_LEGAL]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i8> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_SLICE_4]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1430,7 +1638,8 @@ define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
 define i96 @load_i96(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define i96 @load_i96(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i96 @llvm.amdgcn.raw.ptr.buffer.load.i96(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to i96
 ; CHECK-NEXT:    ret i96 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1441,7 +1650,8 @@ define i96 @load_i96(ptr addrspace(8) %buf) {
 define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_i96(
 ; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i96(i96 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i96 [[DATA]] to <3 x i32>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1452,7 +1662,12 @@ define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
 define i160 @load_i160(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define i160 @load_i160(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i160 @llvm.amdgcn.raw.ptr.buffer.load.i160(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i32> poison, <5 x i32> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i32> [[RET_PARTS_0]], i32 [[RET_PART_4]], i64 4
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <5 x i32> [[RET_SLICE_4]] to i160
 ; CHECK-NEXT:    ret i160 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1463,7 +1678,11 @@ define i160 @load_i160(ptr addrspace(8) %buf) {
 define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_i160(
 ; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i160 [[DATA]] to <5 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i32> [[DATA_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i32> [[DATA_LEGAL]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1474,7 +1693,13 @@ define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
 define i256 @load_i256(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define i256 @load_i256(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i256 @llvm.amdgcn.raw.ptr.buffer.load.i256(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_PART_4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to i256
 ; CHECK-NEXT:    ret i256 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1485,7 +1710,11 @@ define i256 @load_i256(ptr addrspace(8) %buf) {
 define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_i256(
 ; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i256(i256 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i256 [[DATA]] to <8 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1498,7 +1727,8 @@ define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
 define i7 @load_i4(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define i7 @load_i4(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call i7 @llvm.amdgcn.raw.ptr.buffer.load.i7(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i7
 ; CHECK-NEXT:    ret i7 [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1509,7 +1739,8 @@ define i7 @load_i4(ptr addrspace(8) %buf) {
 define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_i4(
 ; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i7(i7 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i7 [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1522,7 +1753,8 @@ define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
 define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <2 x i4> @load_v2i4(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <2 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v2i4(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <2 x i4>
 ; CHECK-NEXT:    ret <2 x i4> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1533,7 +1765,8 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
 define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v2i4(
 ; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i4(<2 x i4> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i4> [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1544,7 +1777,8 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
 define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <4 x i4> @load_v4i4(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v4i4(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <4 x i4>
 ; CHECK-NEXT:    ret <4 x i4> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1555,7 +1789,8 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
 define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v4i4(
 ; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i4(<4 x i4> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i4> [[DATA]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1566,7 +1801,8 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
 define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <8 x i4> @load_v8i4(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v8i4(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <8 x i4>
 ; CHECK-NEXT:    ret <8 x i4> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1577,7 +1813,8 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
 define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v8i4(
 ; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i4(<8 x i4> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <8 x i4> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1590,7 +1827,9 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
 define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <2 x i6> @load_v2i6(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <2 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v2i6(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_TRUNC:%.*]] = trunc i16 [[RET_LOADABLE]] to i12
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i12 [[RET_TRUNC]] to <2 x i6>
 ; CHECK-NEXT:    ret <2 x i6> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1601,7 +1840,9 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
 define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v2i6(
 ; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i6(<2 x i6> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_AS_SCALAR:%.*]] = bitcast <2 x i6> [[DATA]] to i12
+; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i12 [[DATA_AS_SCALAR]] to i16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_ZEXT]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1613,7 +1854,13 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
 define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <6 x i32> @load_v32i6(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <32 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v32i6(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_PART_4]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <6 x i32> [[RET_PARTS_4]] to <32 x i6>
 ; CHECK-NEXT:    [[RET_CAST:%.*]] = bitcast <32 x i6> [[RET]] to <6 x i32>
 ; CHECK-NEXT:    ret <6 x i32> [[RET_CAST]]
 ;
@@ -1627,7 +1874,11 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @store_v32i6(
 ; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v32i6(<32 x i6> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %data = bitcast <6 x i32> %data.abi to <32 x i6>
@@ -1641,7 +1892,8 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
 define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <4 x i8> @volatile_load_v4i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1652,7 +1904,8 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
 define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @volatile_store_v4i8(
 ; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1663,7 +1916,12 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
 define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define <6 x i8> @volatile_load_v6i8(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
+; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
 ; CHECK-NEXT:    ret <6 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1674,7 +1932,11 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
 define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
 ; CHECK-LABEL: define void @volatile_store_v6i8(
 ; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA_LEGAL]], i64 2
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_2]], ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    ret void
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index 90fc3cf3d72ea3..4b47380e7cf145 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -54,7 +54,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw i160 [[BUF_PTR_4_PTR_INT_RSRC]], 32, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_OFF:%.*]] = zext i32 [[BUF_PTR_4_PTR_OFF]] to i160, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT:%.*]] = or i160 [[TMP10]], [[BUF_PTR_4_PTR_INT_OFF]], !dbg [[DBG33]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[BUF_PTR_4_PTR_INT]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_CAST:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_CAST_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_CAST]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_CAST_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT:    [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_CAST_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_CAST]], i64 4, !dbg [[DBG33]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_CAST_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
 ; CHECK-NEXT:    ret float [[RET]], !dbg [[DBG34:![0-9]+]]
 ;
   %buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20

>From e40eba94b1c61f33e78a891d112e4ac5cdba438e Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 2 Oct 2024 14:54:30 +0000
Subject: [PATCH 02/11] Update tests

---
 ...ffer-fat-pointers-contents-legalization.ll | 73 +++++++++++++++++--
 ...ffer-fat-pointers-contents-legalization.ll | 32 +++++++-
 2 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index b8d01c12b5b180..3e8e5f58ca06ff 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
 
 ; Note: if you're adding tests here, also add them to
 ; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
@@ -4260,8 +4260,8 @@ define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
 
 ;;; Non-byte-sized scalars. Require zero-extension.
 
-define i7 @load_i4(ptr addrspace(8) inreg %buf) {
-; SDAG-LABEL: load_i4:
+define i7 @load_i7(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i7:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s11, s17
@@ -4272,7 +4272,7 @@ define i7 @load_i4(ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: load_i4:
+; GISEL-LABEL: load_i7:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, s6
@@ -4287,8 +4287,8 @@ define i7 @load_i4(ptr addrspace(8) inreg %buf) {
   ret i7 %ret
 }
 
-define void @store_i4(i7 %data, ptr addrspace(8) inreg %buf) {
-; SDAG-LABEL: store_i4:
+define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i7:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s11, s17
@@ -4300,7 +4300,7 @@ define void @store_i4(i7 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: store_i4:
+; GISEL-LABEL: store_i7:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, s6
@@ -4316,6 +4316,63 @@ define void @store_i4(i7 %data, ptr addrspace(8) inreg %buf) {
   ret void
 }
 
+define i4 @load_i4(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i4, ptr addrspace(7) %p
+  ret i4 %ret
+}
+
+define void @store_i4(i4 %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_i4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_i4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i4 %data, ptr addrspace(7) %p
+  ret void
+}
+
+
 ;;; Byte-sized vectors of i4. Require casts.
 
 define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index c821d0abfc1f5e..26572813e9b1ba 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -1724,8 +1724,8 @@ define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
 
 ;;; Non-byte-sized scalars. Require zero-extension.
 
-define i7 @load_i4(ptr addrspace(8) %buf) {
-; CHECK-LABEL: define i7 @load_i4(
+define i7 @load_i7(ptr addrspace(8) %buf) {
+; CHECK-LABEL: define i7 @load_i7(
 ; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i7
@@ -1736,8 +1736,8 @@ define i7 @load_i4(ptr addrspace(8) %buf) {
   ret i7 %ret
 }
 
-define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
-; CHECK-LABEL: define void @store_i4(
+define void @store_i7(i7 %data, ptr addrspace(8) %buf) {
+; CHECK-LABEL: define void @store_i7(
 ; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i7 [[DATA]] to i8
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
@@ -1748,6 +1748,30 @@ define void @store_i4(i7 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
+define i4 @load_i4(ptr addrspace(8) %buf) {
+; CHECK-LABEL: define i4 @load_i4(
+; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i4
+; CHECK-NEXT:    ret i4 [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load i4, ptr addrspace(7) %p
+  ret i4 %ret
+}
+
+define void @store_i4(i4 %data, ptr addrspace(8) %buf) {
+; CHECK-LABEL: define void @store_i4(
+; CHECK-SAME: i4 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i4 [[DATA]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store i4 %data, ptr addrspace(7) %p
+  ret void
+}
+
 ;;; Byte-sized vectors of i4. Require casts.
 
 define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {

>From e6f04fb0b80c26a3d30d40af058aafd1e7756bf9 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 3 Oct 2024 17:29:04 +0000
Subject: [PATCH 03/11] Address review feedback re CreateBitCast, handle
 arbitrary aggregates

---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   | 415 +++++++++++-------
 1 file changed, 262 insertions(+), 153 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 831474c192526f..8e035382dfce06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -66,26 +66,6 @@
 // Atomics operations on `ptr addrspace(7)` values are not suppported, as the
 // hardware does not include a 160-bit atomic.
 //
-// ## Type remapping
-//
-// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
-// to the corresponding struct type, which has a resource part and an offset
-// part.
-//
-// This uses a `BufferFatPtrToStructTypeMap` and a `FatPtrConstMaterializer`
-// to, usually by way of `setType`ing values. Constants are handled here
-// because there isn't a good way to fix them up later.
-//
-// This has the downside of leaving the IR in an invalid state (for example,
-// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
-// but all such invalid states will be resolved by the third phase.
-//
-// Functions that don't take buffer fat pointers are modified in place. Those
-// that do take such pointers have their basic blocks moved to a new function
-// with arguments that are {ptr addrspace(8), i32} arguments and return values.
-// This phase also records intrinsics so that they can be remangled or deleted
-// later.
-//
 // ## Buffer contents type legalization
 //
 // The underlying buffer intrinsics only support types up to 128 bits long,
@@ -98,6 +78,8 @@
 //
 // This involves a combination of
 // - Converting arrays to vectors where possible
+// - Otherwise, splitting loads and stores of aggregates into loads/stores of
+//   each component.
 // - Zero-extending things to fill a whole number of bytes
 // - Casting values of types that don't neatly correspond to supported machine
 // value
@@ -106,6 +88,26 @@
 // - Splitting values that are too long (such as aforementioned <8 x i32>) into
 //   multiple operations.
 //
+// ## Type remapping
+//
+// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
+// to the corresponding struct type, which has a resource part and an offset
+// part.
+//
+// This uses a `BufferFatPtrToStructTypeMap` and a `FatPtrConstMaterializer`
+// to, usually by way of `setType`ing values. Constants are handled here
+// because there isn't a good way to fix them up later.
+//
+// This has the downside of leaving the IR in an invalid state (for example,
+// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
+// but all such invalid states will be resolved by the third phase.
+//
+// Functions that don't take buffer fat pointers are modified in place. Those
+// that do take such pointers have their basic blocks moved to a new function
+// with arguments that are {ptr addrspace(8), i32} arguments and return values.
+// This phase also records intrinsics so that they can be remangled or deleted
+// later.
+//
 // ## Splitting pointer structs
 //
 // The meat of this pass consists of defining semantics for operations that
@@ -600,11 +602,13 @@ namespace {
 /// one ore more such loads/stores that consist of legal types.
 ///
 /// Do this by
-/// 1. Converting arrays of non-aggregate, byte-sized types into their
+/// 1. Recursing into structs (and arrays that don't share a memory layout with
+/// vectors) since the intrinsics can't handle complex types.
+/// 2. Converting arrays of non-aggregate, byte-sized types into their
 /// correspondinng vectors
-/// 2. Bitcasting unsupported types, namely overly-long scalars and byte
+/// 3. Bitcasting unsupported types, namely overly-long scalars and byte
 /// vectors, into vectors of supported types.
-/// 3. Splitting up excessively long reads/writes into multiple operations.
+/// 4. Splitting up excessively long reads/writes into multiple operations.
 ///
 /// Note that this doesn't handle complex data strucures, but, in the future,
 /// the aggregate load splitter from SROA could be refactored to allow for that
@@ -620,8 +624,8 @@ class LegalizeBufferContentTypesVisitor
   /// If T is [N x U], where U is a scalar type, return the vector type
   /// <N x U>, otherwise, return T.
   Type *scalarArrayTypeAsVector(Type *MaybeArrayType);
-  Value *arrayToVector(Value *V, Type *TargetType, StringRef Name);
-  Value *vectorToArray(Value *V, Type *OrigType, StringRef Name);
+  Value *arrayToVector(Value *V, Type *TargetType, const Twine &Name);
+  Value *vectorToArray(Value *V, Type *OrigType, const Twine &Name);
 
   /// Break up the loads of a struct into the loads of its components
 
@@ -629,22 +633,22 @@ class LegalizeBufferContentTypesVisitor
   /// intrinsics to one that would be legal through bitcasts and/or truncation.
   /// Uses the wider of i32, i16, or i8 where possible.
   Type *legalNonAggregateFor(Type *T);
-  Value *makeLegalNonAggregate(Value *V, Type *TargetType, StringRef Name);
-  Value *makeIllegalNonAggregate(Value *V, Type *OrigType, StringRef Name);
+  Value *makeLegalNonAggregate(Value *V, Type *TargetType, const Twine &Name);
+  Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
 
-  struct Slice {
-    unsigned Offset;
-    unsigned Length;
-    Slice(unsigned Offset, unsigned Length) : Offset(Offset), Length(Length) {}
+  struct VecSlice {
+    uint64_t Index;
+    uint64_t Length;
+    VecSlice(uint64_t Index, uint64_t Length) : Index(Index), Length(Length) {}
   };
-  // Return the [offset, length] pairs into which `T` needs to be cut to form
+  // Return the [index, length] pairs into which `T` needs to be cut to form
   // legal buffer load or store operations. Clears `Slices`. Creates an empty
   // `Slices` for non-vector inputs and creates one slice if no slicing will be
   // needed.
-  void getSlices(Type *T, SmallVectorImpl<Slice> &Slices);
+  void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
 
-  Value *extractSlice(Value *Vec, Slice S, StringRef Name);
-  Value *insertSlice(Value *Whole, Value *Part, Slice S, StringRef Name);
+  Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
+  Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
 
   // In most cases, return `LegalType`. However, when given an input that would
   // normally be a legal type for the buffer intrinsics to return but that isn't
@@ -656,6 +660,15 @@ class LegalizeBufferContentTypesVisitor
   // i32>
   Type *intrinsicTypeFor(Type *LegalType);
 
+  bool visitLoadImpl(LoadInst &OrigLI, Type *PartType,
+                     SmallVectorImpl<uint32_t> &AggIdxs, uint64_t AggByteOffset,
+                     Value *&Result, const Twine &Name);
+  // Return value is (Changed, ModifiedInPlace)
+  std::pair<bool, bool> visitStoreImpl(StoreInst &OrigSI, Type *PartType,
+                                       SmallVectorImpl<uint32_t> &AggIdxs,
+                                       uint64_t AggByteOffset,
+                                       const Twine &Name);
+
   bool visitInstruction(Instruction &I) { return false; }
   bool visitLoadInst(LoadInst &LI);
   bool visitStoreInst(StoreInst &SI);
@@ -673,17 +686,17 @@ Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
     return T;
   Type *ET = AT->getElementType();
   if (!ET->isSingleValueType() || isa<VectorType>(ET))
-    report_fatal_error(
-        "loading non-scalar arrays from buffer fat pointers is unimplemented");
+    report_fatal_error("loading non-scalar arrays from buffer fat pointers "
+                       "should have recursed");
   if (!DL.typeSizeEqualsStoreSize(AT))
     report_fatal_error(
-        "loading padded arrays from buffer fat pinters is unimplemented");
+        "loading padded arrays from buffer fat pinters should have recursed");
   return FixedVectorType::get(ET, AT->getNumElements());
 }
 
 Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
                                                         Type *TargetType,
-                                                        StringRef Name) {
+                                                        const Twine &Name) {
   Value *VectorRes = PoisonValue::get(TargetType);
   auto *VT = cast<FixedVectorType>(TargetType);
   unsigned EC = VT->getNumElements();
@@ -697,7 +710,7 @@ Value *LegalizeBufferContentTypesVisitor::arrayToVector(Value *V,
 
 Value *LegalizeBufferContentTypesVisitor::vectorToArray(Value *V,
                                                         Type *OrigType,
-                                                        StringRef Name) {
+                                                        const Twine &Name) {
   Value *ArrayRes = PoisonValue::get(OrigType);
   ArrayType *AT = cast<ArrayType>(OrigType);
   unsigned EC = AT->getNumElements();
@@ -714,13 +727,14 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
   // Implicitly zero-extend to the next byte if needed
   if (!DL.typeSizeEqualsStoreSize(T))
     T = IRB.getIntNTy(Size.getFixedValue());
-  auto *VT = dyn_cast<VectorType>(T);
   Type *ElemTy = T;
-  if (VT) {
+  if (auto *VT = dyn_cast<FixedVectorType>(T)) {
     ElemTy = VT->getElementType();
   }
-  if (isa<PointerType>(ElemTy))
-    return T; // Pointers are always big enough
+  if (isa<PointerType, ScalableVectorType>(ElemTy))
+    // Pointers are always big enough, and scalable vectors shouldn't crash the
+    // pass.
+    return T;
   unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
   if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
     // [vectors of] anything that's 16/32/64/128 bits can be cast and split into
@@ -742,7 +756,7 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
 }
 
 Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
-    Value *V, Type *TargetType, StringRef Name) {
+    Value *V, Type *TargetType, const Twine &Name) {
   Type *SourceType = V->getType();
   if (DL.getTypeSizeInBits(SourceType) != DL.getTypeSizeInBits(TargetType)) {
     Type *ShortScalarTy =
@@ -754,13 +768,11 @@ Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
     V = Zext;
     SourceType = ByteScalarTy;
   }
-  if (SourceType == TargetType)
-    return V;
   return IRB.CreateBitCast(V, TargetType, Name + ".legal");
 }
 
 Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
-    Value *V, Type *OrigType, StringRef Name) {
+    Value *V, Type *OrigType, const Twine &Name) {
   Type *LegalType = V->getType();
   if (DL.getTypeSizeInBits(LegalType) != DL.getTypeSizeInBits(OrigType)) {
     Type *ShortScalarTy =
@@ -773,8 +785,6 @@ Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
       return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
     return Trunc;
   }
-  if (LegalType == OrigType)
-    return V;
   return IRB.CreateBitCast(V, OrigType, Name + ".real.ty");
 }
 
@@ -806,58 +816,63 @@ Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
   return LegalType;
 }
 
-void LegalizeBufferContentTypesVisitor::getSlices(
-    Type *T, SmallVectorImpl<Slice> &Slices) {
+void LegalizeBufferContentTypesVisitor::getVecSlices(
+    Type *T, SmallVectorImpl<VecSlice> &Slices) {
   Slices.clear();
   auto *VT = dyn_cast<FixedVectorType>(T);
   if (!VT)
     return;
 
-  unsigned ElemBitWidth =
+  uint64_t ElemBitWidth =
       DL.getTypeSizeInBits(VT->getElementType()).getFixedValue();
 
-  unsigned ElemsPer4Words = 128 / ElemBitWidth;
-  unsigned ElemsPer2Words = ElemsPer4Words / 2;
-  unsigned ElemsPerWord = ElemsPer2Words / 2;
-  unsigned ElemsPerShort = ElemsPerWord / 2;
-  unsigned ElemsPerByte = ElemsPerShort / 2;
+  uint64_t ElemsPer4Words = 128 / ElemBitWidth;
+  uint64_t ElemsPer2Words = ElemsPer4Words / 2;
+  uint64_t ElemsPerWord = ElemsPer2Words / 2;
+  uint64_t ElemsPerShort = ElemsPerWord / 2;
+  uint64_t ElemsPerByte = ElemsPerShort / 2;
   // If the elements evenly pack into 32-bit words, we can use 3-word stores,
   // such as for <6 x bfloat> or <3 x i32>, but we can't dot his for, for
   // example, <3 x i64>, since that's not slicing.
-  unsigned ElemsPer3Words = ElemsPerWord * 3;
+  uint64_t ElemsPer3Words = ElemsPerWord * 3;
 
-  unsigned TotalElems = VT->getNumElements();
-  unsigned Off = 0;
+  uint64_t TotalElems = VT->getNumElements();
+  uint64_t Index = 0;
   auto TrySlice = [&](unsigned MaybeLen) {
-    if (MaybeLen > 0 && Off + MaybeLen <= TotalElems) {
-      Slices.emplace_back(/*Offset=*/Off, /*Length=*/MaybeLen);
-      Off += MaybeLen;
+    if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
+      Slices.emplace_back(/*Index=*/Index, /*Length=*/MaybeLen);
+      Index += MaybeLen;
       return true;
     }
     return false;
   };
-  while (Off < TotalElems) {
+  while (Index < TotalElems) {
     TrySlice(ElemsPer4Words) || TrySlice(ElemsPer3Words) ||
         TrySlice(ElemsPer2Words) || TrySlice(ElemsPerWord) ||
         TrySlice(ElemsPerShort) || TrySlice(ElemsPerByte);
   }
 }
 
-Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, Slice S,
-                                                       StringRef Name) {
+Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S,
+                                                       const Twine &Name) {
+  if (!isa<FixedVectorType>(Vec->getType()))
+    return Vec;
   if (S.Length == 1)
-    return IRB.CreateExtractElement(Vec, S.Offset,
-                                    Name + ".slice." + Twine(S.Offset));
-  SmallVector<int> Mask = llvm::to_vector(llvm::iota_range<int>(
-      S.Offset, S.Offset + S.Length, /*Inclusive=*/false));
-  return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Offset));
+    return IRB.CreateExtractElement(Vec, S.Index,
+                                    Name + ".slice." + Twine(S.Index));
+  SmallVector<int> Mask = llvm::to_vector(
+      llvm::iota_range<int>(S.Index, S.Index + S.Length, /*Inclusive=*/false));
+  return IRB.CreateShuffleVector(Vec, Mask, Name + ".slice." + Twine(S.Index));
 }
 
 Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
-                                                      Slice S, StringRef Name) {
+                                                      VecSlice S,
+                                                      const Twine &Name) {
+  if (!isa<FixedVectorType>(Whole->getType()))
+    return Part;
   if (S.Length == 1) {
-    return IRB.CreateInsertElement(Whole, Part, S.Offset,
-                                   Name + ".slice." + Twine(S.Offset));
+    return IRB.CreateInsertElement(Whole, Part, S.Index,
+                                   Name + ".slice." + Twine(S.Index));
   }
   int NumElems = cast<FixedVectorType>(Whole->getType())->getNumElements();
 
@@ -868,102 +883,184 @@ Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
     E = I;
   }
   Value *ExtPart = IRB.CreateShuffleVector(Part, ExtPartMask,
-                                           Name + ".ext." + Twine(S.Offset));
+                                           Name + ".ext." + Twine(S.Index));
 
   SmallVector<int> Mask =
       llvm::to_vector(llvm::iota_range<int>(0, NumElems, /*Inclusive=*/false));
   for (auto [I, E] :
-       llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Offset, S.Length)))
+       llvm::enumerate(MutableArrayRef<int>(Mask).slice(S.Index, S.Length)))
     E = I + NumElems;
   return IRB.CreateShuffleVector(Whole, ExtPart, Mask,
-                                 Name + ".parts." + Twine(S.Offset));
-}
+                                 Name + ".parts." + Twine(S.Index));
+}
+
+bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
+    LoadInst &OrigLI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
+    uint64_t AggByteOff, Value *&Result, const Twine &Name) {
+  if (auto *ST = dyn_cast<StructType>(PartType)) {
+    const StructLayout *Layout = DL.getStructLayout(ST);
+    bool Changed = false;
+    for (auto [I, ElemTy, Offset] :
+         llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
+      AggIdxs.push_back(I);
+      Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+                               AggByteOff + Offset.getKnownMinValue(), Result,
+                               Name + "." + Twine(I));
+      AggIdxs.pop_back();
+    }
+    return Changed;
+  }
+  if (auto *AT = dyn_cast<ArrayType>(PartType)) {
+    Type *ElemTy = AT->getElementType();
+    TypeSize AllocSize = DL.getTypeAllocSizeInBits(ElemTy);
+    if (!(ElemTy->isSingleValueType() &&
+          DL.getTypeSizeInBits(ElemTy) == AllocSize && !ElemTy->isVectorTy())) {
+      bool Changed = false;
+      for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
+                                               /*Inclusive=*/false)) {
+        AggIdxs.push_back(I);
+        Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
+                                 AggByteOff + I * AllocSize.getKnownMinValue(),
+                                 Result, Name + Twine(I));
+        AggIdxs.pop_back();
+      }
+      return Changed;
+    }
+  }
 
-bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) {
-  if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
-    return false;
-  Type *OrigType = LI.getType();
-  Type *ArrayAsVecType = scalarArrayTypeAsVector(OrigType);
+  // Typical case
+
+  Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
   Type *LegalType = legalNonAggregateFor(ArrayAsVecType);
 
-  SmallVector<Slice> Slices;
-  getSlices(LegalType, Slices);
-  bool NeedToSplit = Slices.size() > 1;
+  SmallVector<VecSlice> Slices;
+  getVecSlices(LegalType, Slices);
+  bool HasSlices = Slices.size() > 1;
+  bool IsAggPart = !AggIdxs.empty();
   Value *LoadsRes;
-  StringRef Name = LI.getName();
-  if (!NeedToSplit) {
+  if (!HasSlices && !IsAggPart) {
     Type *LoadableType = intrinsicTypeFor(LegalType);
-    if (LoadableType == OrigType)
+    if (LoadableType == PartType)
       return false;
 
-    IRB.SetInsertPoint(&LI);
-    auto *NLI = cast<LoadInst>(LI.clone());
+    IRB.SetInsertPoint(&OrigLI);
+    auto *NLI = cast<LoadInst>(OrigLI.clone());
     NLI->mutateType(LoadableType);
     NLI = IRB.Insert(NLI);
     NLI->setName(Name + ".loadable");
 
-    LoadsRes = NLI;
-    if (LoadableType != LegalType) {
-      LoadsRes =
-          IRB.CreateBitCast(LoadsRes, LegalType, Name + ".from.loadable");
-    }
+    LoadsRes = IRB.CreateBitCast(NLI, LegalType, Name + ".from.loadable");
   } else {
-    IRB.SetInsertPoint(&LI);
+    IRB.SetInsertPoint(&OrigLI);
     LoadsRes = PoisonValue::get(LegalType);
-    Value *OrigPtr = LI.getPointerOperand();
+    Value *OrigPtr = OrigLI.getPointerOperand();
     // If we're needing to spill something into more than one load, its legal
     // type will be a vector (ex. an i256 load will have LegalType = <8 x i32>).
-    Type *ElemType = cast<VectorType>(LegalType)->getElementType();
+    // But if we're already a scalar (which can happen if we're splitting up a
+    // struct), the element type will be the legal type itself.
+    Type *ElemType = LegalType;
+    if (auto *VT = dyn_cast<FixedVectorType>(LegalType))
+      ElemType = VT->getElementType();
     unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
-    AAMDNodes AANodes = LI.getAAMetadata();
-    for (Slice S : Slices) {
+    AAMDNodes AANodes = OrigLI.getAAMetadata();
+    if (IsAggPart && Slices.empty())
+      Slices.emplace_back(/*Index=*/0, /*Length=*/1);
+    for (VecSlice S : Slices) {
       Type *SliceType =
           S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
-      unsigned ByteOffset = S.Offset * ElemBytes;
+      int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
       // You can't reasonably expect loads to wrap around the edge of memory.
       Value *NewPtr = IRB.CreateGEP(
-          IRB.getInt8Ty(), LI.getPointerOperand(), IRB.getInt32(ByteOffset),
-          OrigPtr->getName() + ".part.ptr." + Twine(S.Offset),
+          IRB.getInt8Ty(), OrigLI.getPointerOperand(), IRB.getInt32(ByteOffset),
+          OrigPtr->getName() + ".off.ptr." + Twine(ByteOffset),
           GEPNoWrapFlags::noUnsignedWrap());
       Type *LoadableType = intrinsicTypeFor(SliceType);
       LoadInst *NewLI = IRB.CreateAlignedLoad(
-          LoadableType, NewPtr, commonAlignment(LI.getAlign(), ByteOffset),
-          Name + ".part." + Twine(S.Offset));
-      copyMetadataForLoad(*NewLI, LI);
+          LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
+          Name + ".off." + Twine(ByteOffset));
+      copyMetadataForLoad(*NewLI, OrigLI);
       NewLI->setAAMetadata(
           AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
-      if (LI.isAtomic())
-        NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
-      if (LI.isVolatile())
-        NewLI->setVolatile(LI.isVolatile());
-      Value *Loaded = NewLI;
-      if (LoadableType != SliceType)
-        Loaded = IRB.CreateBitCast(NewLI, SliceType,
-                                   NewLI->getName() + ".from.loadable");
+      if (OrigLI.isAtomic())
+        NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
+      if (OrigLI.isVolatile())
+        NewLI->setVolatile(OrigLI.isVolatile());
+      Value *Loaded = IRB.CreateBitCast(NewLI, SliceType,
+                                        NewLI->getName() + ".from.loadable");
       LoadsRes = insertSlice(LoadsRes, Loaded, S, Name);
     }
   }
   if (LegalType != ArrayAsVecType)
     LoadsRes = makeIllegalNonAggregate(LoadsRes, ArrayAsVecType, Name);
-  if (ArrayAsVecType != OrigType)
-    LoadsRes = vectorToArray(LoadsRes, OrigType, Name);
-  LoadsRes->takeName(&LI);
-  LI.replaceAllUsesWith(LoadsRes);
-  LI.eraseFromParent();
+  if (ArrayAsVecType != PartType)
+    LoadsRes = vectorToArray(LoadsRes, PartType, Name);
+
+  if (IsAggPart)
+    Result = IRB.CreateInsertValue(Result, LoadsRes, AggIdxs, Name);
+  else
+    Result = LoadsRes;
   return true;
 }
 
-bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
-  if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+bool LegalizeBufferContentTypesVisitor::visitLoadInst(LoadInst &LI) {
+  if (LI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
     return false;
-  IRB.SetInsertPoint(&SI);
-  Value *OrigData = SI.getValueOperand();
-  Type *OrigType = OrigData->getType();
-  StringRef Name = OrigData->getName();
+
+  SmallVector<uint32_t> AggIdxs;
+  Type *OrigType = LI.getType();
+  Value *Result = PoisonValue::get(OrigType);
+  bool Changed = visitLoadImpl(LI, OrigType, AggIdxs, 0, Result, LI.getName());
+  if (!Changed)
+    return false;
+  Result->takeName(&LI);
+  LI.replaceAllUsesWith(Result);
+  LI.eraseFromParent();
+  return Changed;
+}
+
+std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
+    StoreInst &OrigSI, Type *PartType, SmallVectorImpl<uint32_t> &AggIdxs,
+    uint64_t AggByteOff, const Twine &Name) {
+  if (auto *ST = dyn_cast<StructType>(PartType)) {
+    const StructLayout *Layout = DL.getStructLayout(ST);
+    bool Changed = false;
+    for (auto [I, ElemTy, Offset] :
+         llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
+      AggIdxs.push_back(I);
+      Changed |= std::get<0>(visitStoreImpl(
+          OrigSI, ElemTy, AggIdxs, AggByteOff + Offset.getKnownMinValue(),
+          Name + "." + Twine(I)));
+      AggIdxs.pop_back();
+    }
+    return std::make_pair(Changed, /*ModifiedInPlace=*/false);
+  }
+  if (auto *AT = dyn_cast<ArrayType>(PartType)) {
+    Type *ElemTy = AT->getElementType();
+    TypeSize AllocSize = DL.getTypeAllocSizeInBits(ElemTy);
+    if (!(ElemTy->isSingleValueType() &&
+          DL.getTypeSizeInBits(ElemTy) == AllocSize && !ElemTy->isVectorTy())) {
+      bool Changed = false;
+      for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
+                                               /*Inclusive=*/false)) {
+        AggIdxs.push_back(I);
+        Changed |= std::get<0>(visitStoreImpl(
+            OrigSI, ElemTy, AggIdxs,
+            AggByteOff + I * AllocSize.getKnownMinValue(), Name + Twine(I)));
+        AggIdxs.pop_back();
+      }
+      return std::make_pair(Changed, /*ModifiedInPlace=*/false);
+    }
+  }
+
+  Value *OrigData = OrigSI.getValueOperand();
   Value *NewData = OrigData;
 
-  Type *ArrayAsVecType = scalarArrayTypeAsVector(OrigType);
-  if (ArrayAsVecType != OrigType) {
+  bool IsAggPart = !AggIdxs.empty();
+  if (IsAggPart)
+    NewData = IRB.CreateExtractValue(NewData, AggIdxs, Name);
+
+  Type *ArrayAsVecType = scalarArrayTypeAsVector(PartType);
+  if (ArrayAsVecType != PartType) {
     NewData = arrayToVector(NewData, ArrayAsVecType, Name);
   }
 
@@ -972,47 +1069,59 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
     NewData = makeLegalNonAggregate(NewData, LegalType, Name);
   }
 
-  SmallVector<Slice> Slices;
-  getSlices(LegalType, Slices);
-  bool NeedToSplit = Slices.size() > 1;
+  SmallVector<VecSlice> Slices;
+  getVecSlices(LegalType, Slices);
+  bool NeedToSplit = Slices.size() > 1 || IsAggPart;
   if (!NeedToSplit) {
     Type *StorableType = intrinsicTypeFor(LegalType);
-    if (StorableType == OrigType)
-      return false;
-    if (StorableType != LegalType)
-      NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable");
-
-    SI.setOperand(0, NewData);
-    return true;
+    if (StorableType == PartType)
+      return std::make_pair(/*Changed=*/false, /*ModifiedInPlace=*/false);
+    NewData = IRB.CreateBitCast(NewData, StorableType, Name + ".storable");
+    OrigSI.setOperand(0, NewData);
+    return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/true);
   }
 
-  Value *OrigPtr = SI.getPointerOperand();
-  Type *ElemType = cast<VectorType>(LegalType)->getElementType();
+  Value *OrigPtr = OrigSI.getPointerOperand();
+  Type *ElemType = LegalType;
+  if (auto *VT = dyn_cast<FixedVectorType>(LegalType))
+    ElemType = VT->getElementType();
+  if (IsAggPart && Slices.empty())
+    Slices.emplace_back(/*Index=*/0, /*Length=*/1);
   unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
-  AAMDNodes AANodes = SI.getAAMetadata();
-  for (Slice S : Slices) {
+  AAMDNodes AANodes = OrigSI.getAAMetadata();
+  for (VecSlice S : Slices) {
     Type *SliceType =
         S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
-    unsigned ByteOffset = S.Offset * ElemBytes;
+    int64_t ByteOffset = AggByteOff + S.Index * ElemBytes;
     Value *NewPtr =
         IRB.CreateGEP(IRB.getInt8Ty(), OrigPtr, IRB.getInt32(ByteOffset),
-                      OrigPtr->getName() + ".part." + Twine(S.Offset),
+                      OrigPtr->getName() + ".part." + Twine(S.Index),
                       GEPNoWrapFlags::noUnsignedWrap());
     Value *DataSlice = extractSlice(NewData, S, Name);
     Type *StorableType = intrinsicTypeFor(SliceType);
-    if (StorableType != SliceType) {
-      DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
-                                    DataSlice->getName() + ".storable");
-    }
-    auto *NewSI = cast<StoreInst>(SI.clone());
-    NewSI->setAlignment(commonAlignment(SI.getAlign(), ByteOffset));
+    DataSlice = IRB.CreateBitCast(DataSlice, StorableType,
+                                  DataSlice->getName() + ".storable");
+    auto *NewSI = cast<StoreInst>(OrigSI.clone());
+    NewSI->setAlignment(commonAlignment(OrigSI.getAlign(), ByteOffset));
     IRB.Insert(NewSI);
     NewSI->setOperand(0, DataSlice);
     NewSI->setOperand(1, NewPtr);
     NewSI->setAAMetadata(AANodes.adjustForAccess(ByteOffset, StorableType, DL));
   }
-  SI.eraseFromParent();
-  return true;
+  return std::make_pair(/*Changed=*/true, /*ModifiedInPlace=*/false);
+}
+
+bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
+  if (SI.getPointerAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  IRB.SetInsertPoint(&SI);
+  SmallVector<uint32_t> AggIdxs;
+  Value *OrigData = SI.getValueOperand();
+  auto [Changed, ModifiedInPlace] =
+      visitStoreImpl(SI, OrigData->getType(), AggIdxs, 0, OrigData->getName());
+  if (Changed && !ModifiedInPlace)
+    SI.eraseFromParent();
+  return Changed;
 }
 
 bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {

>From dda3f911b981bbfb4dd2f407ecbafc9032c33280 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 3 Oct 2024 19:59:15 +0000
Subject: [PATCH 04/11] Add tests for structs, arrays, scalable vectors

---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |   38 +-
 ...ffer-fat-pointers-contents-legalization.ll |  398 +++++++
 ...mdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll |    6 +
 ...ffer-fat-pointers-contents-legalization.ll | 1036 ++++++++++-------
 4 files changed, 1065 insertions(+), 413 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 8e035382dfce06..75d48c0a7a606a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -732,8 +732,8 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
     ElemTy = VT->getElementType();
   }
   if (isa<PointerType, ScalableVectorType>(ElemTy))
-    // Pointers are always big enough, and scalable vectors shouldn't crash the
-    // pass.
+    // Pointers are always big enough, and we'll let scalable vectors through to
+    // fail in codegen.
     return T;
   unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
   if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
@@ -855,7 +855,10 @@ void LegalizeBufferContentTypesVisitor::getVecSlices(
 
 Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S,
                                                        const Twine &Name) {
-  if (!isa<FixedVectorType>(Vec->getType()))
+  auto *VecVT = dyn_cast<FixedVectorType>(Vec->getType());
+  if (!VecVT)
+    return Vec;
+  if (S.Length == VecVT->getNumElements() && S.Index == 0)
     return Vec;
   if (S.Length == 1)
     return IRB.CreateExtractElement(Vec, S.Index,
@@ -868,7 +871,10 @@ Value *LegalizeBufferContentTypesVisitor::extractSlice(Value *Vec, VecSlice S,
 Value *LegalizeBufferContentTypesVisitor::insertSlice(Value *Whole, Value *Part,
                                                       VecSlice S,
                                                       const Twine &Name) {
-  if (!isa<FixedVectorType>(Whole->getType()))
+  auto *WholeVT = dyn_cast<FixedVectorType>(Whole->getType());
+  if (!WholeVT)
+    return Part;
+  if (S.Length == WholeVT->getNumElements() && S.Index == 0)
     return Part;
   if (S.Length == 1) {
     return IRB.CreateInsertElement(Whole, Part, S.Index,
@@ -904,7 +910,7 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
          llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
       AggIdxs.push_back(I);
       Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
-                               AggByteOff + Offset.getKnownMinValue(), Result,
+                               AggByteOff + Offset.getFixedValue(), Result,
                                Name + "." + Twine(I));
       AggIdxs.pop_back();
     }
@@ -912,15 +918,16 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
   }
   if (auto *AT = dyn_cast<ArrayType>(PartType)) {
     Type *ElemTy = AT->getElementType();
-    TypeSize AllocSize = DL.getTypeAllocSizeInBits(ElemTy);
+    TypeSize AllocSize = DL.getTypeAllocSize(ElemTy);
     if (!(ElemTy->isSingleValueType() &&
-          DL.getTypeSizeInBits(ElemTy) == AllocSize && !ElemTy->isVectorTy())) {
+          DL.getTypeSizeInBits(ElemTy) == 8 * AllocSize &&
+          !ElemTy->isVectorTy())) {
       bool Changed = false;
       for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
                                                /*Inclusive=*/false)) {
         AggIdxs.push_back(I);
         Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
-                                 AggByteOff + I * AllocSize.getKnownMinValue(),
+                                 AggByteOff + I * AllocSize.getFixedValue(),
                                  Result, Name + Twine(I));
         AggIdxs.pop_back();
       }
@@ -1027,25 +1034,26 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
     for (auto [I, ElemTy, Offset] :
          llvm::enumerate(ST->elements(), Layout->getMemberOffsets())) {
       AggIdxs.push_back(I);
-      Changed |= std::get<0>(visitStoreImpl(
-          OrigSI, ElemTy, AggIdxs, AggByteOff + Offset.getKnownMinValue(),
-          Name + "." + Twine(I)));
+      Changed |= std::get<0>(visitStoreImpl(OrigSI, ElemTy, AggIdxs,
+                                            AggByteOff + Offset.getFixedValue(),
+                                            Name + "." + Twine(I)));
       AggIdxs.pop_back();
     }
     return std::make_pair(Changed, /*ModifiedInPlace=*/false);
   }
   if (auto *AT = dyn_cast<ArrayType>(PartType)) {
     Type *ElemTy = AT->getElementType();
-    TypeSize AllocSize = DL.getTypeAllocSizeInBits(ElemTy);
+    TypeSize AllocSize = DL.getTypeAllocSize(ElemTy);
     if (!(ElemTy->isSingleValueType() &&
-          DL.getTypeSizeInBits(ElemTy) == AllocSize && !ElemTy->isVectorTy())) {
+          DL.getTypeSizeInBits(ElemTy) == 8 * AllocSize &&
+          !ElemTy->isVectorTy())) {
       bool Changed = false;
       for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
                                                /*Inclusive=*/false)) {
         AggIdxs.push_back(I);
         Changed |= std::get<0>(visitStoreImpl(
-            OrigSI, ElemTy, AggIdxs,
-            AggByteOff + I * AllocSize.getKnownMinValue(), Name + Twine(I)));
+            OrigSI, ElemTy, AggIdxs, AggByteOff + I * AllocSize.getFixedValue(),
+            Name + Twine(I)));
         AggIdxs.pop_back();
       }
       return std::make_pair(Changed, /*ModifiedInPlace=*/false);
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 3e8e5f58ca06ff..3974cc1054bb5c 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -4926,3 +4926,401 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
   store volatile <6 x i8> %data, ptr addrspace(7) %p
   ret void
 }
+
+define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x [2 x i32]], ptr addrspace(7) %p
+  ret [2 x [2 x i32]] %ret
+}
+
+define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2a2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2a2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x [2 x i32]] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x <2 x i32>] @load_a2v2i32(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a2v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a2v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x <2 x i32>], ptr addrspace(7) %p
+  ret [2 x <2 x i32>] %ret
+}
+
+define void @store_a2v2i32([2 x <2 x i32>] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a2v2i32:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a2v2i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x <2 x i32>] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i32 } @load_sl_i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i32 }, ptr addrspace(7) %p
+  ret { i32 } %ret
+}
+
+define void @store_sl_i32s({ i32 } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { { float } } @load_sl_sl_f32ss(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_sl_f32ss:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_sl_f32ss:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { { float } }, ptr addrspace(7) %p
+  ret { { float } } %ret
+}
+
+define void @store_sl_sl_f32ss({ { float } } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_sl_f32ss:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_sl_f32ss:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { { float } } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { <2 x i32> } @load_sl_v2i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_v2i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_v2i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { <2 x i32> }, ptr addrspace(7) %p
+  ret { <2 x i32> } %ret
+}
+
+define void @store_sl_v2i32s({ <2 x i32> } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_v2i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_v2i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { <2 x i32> } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i64, i32 } @load_sl_i64i32s(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_sl_i64i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_sl_i64i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i64, i32 }, ptr addrspace(7) %p
+  ret { i64, i32 } %ret
+}
+
+define void @store_sl_i64i32s({ i64, i32 } %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_sl_i64i32s:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_sl_i64i32s:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i64, i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [4 x i7] @load_a4i7(ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: load_a4i7:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
+; SDAG-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
+; SDAG-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: load_a4i7:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:1
+; GISEL-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GISEL-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [4 x i7], ptr addrspace(7) %p
+  ret [4 x i7] %ret
+}
+
+define void @store_a4i7([4 x i7] %data, ptr addrspace(8) inreg %buf) {
+; SDAG-LABEL: store_a4i7:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s11, s17
+; SDAG-NEXT:    s_mov_b32 s10, s16
+; SDAG-NEXT:    s_mov_b32 s9, s7
+; SDAG-NEXT:    s_mov_b32 s8, s6
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v1
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0 offset:1
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v2
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0 offset:2
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v3
+; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0 offset:3
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: store_a4i7:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, s6
+; GISEL-NEXT:    s_mov_b32 s5, s7
+; GISEL-NEXT:    s_mov_b32 s6, s16
+; GISEL-NEXT:    s_mov_b32 s7, s17
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v1
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v2
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v3
+; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [4 x i7] %data, ptr addrspace(7) %p
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
new file mode 100644
index 00000000000000..2cb8489bd53dab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
@@ -0,0 +1,6 @@
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 < %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s
+define void @buffer_store_nxv2i32(ptr addrspace(8) inreg %rsrc, i32 %offset) {
+  call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> poison, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index 26572813e9b1ba..d18f0f8bd1ff93 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -9,9 +9,9 @@ target triple = "amdgcn--"
 
 ;;; Legal types. These are natively supported, no casts should be performed.
 
-define i8 @load_i8(ptr addrspace(8) %buf) {
+define i8 @load_i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i8 @load_i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
@@ -20,9 +20,9 @@ define i8 @load_i8(ptr addrspace(8) %buf) {
   ret i8 %ret
 }
 
-define void @store_i8(i8 %data, ptr addrspace(8) %buf) {
+define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i8(
-; CHECK-SAME: i8 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -31,9 +31,9 @@ define void @store_i8(i8 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i16 @load_i16(ptr addrspace(8) %buf) {
+define i16 @load_i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i16 @load_i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
@@ -42,9 +42,9 @@ define i16 @load_i16(ptr addrspace(8) %buf) {
   ret i16 %ret
 }
 
-define void @store_i16(i16 %data, ptr addrspace(8) %buf) {
+define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i16(
-; CHECK-SAME: i16 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -53,9 +53,9 @@ define void @store_i16(i16 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i32 @load_i32(ptr addrspace(8) %buf) {
+define i32 @load_i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i32 @load_i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i32 [[RET]]
 ;
@@ -64,9 +64,9 @@ define i32 @load_i32(ptr addrspace(8) %buf) {
   ret i32 %ret
 }
 
-define void @store_i32(i32 %data, ptr addrspace(8) %buf) {
+define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i32(
-; CHECK-SAME: i32 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i32 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -75,9 +75,9 @@ define void @store_i32(i32 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i64 @load_i64(ptr addrspace(8) %buf) {
+define i64 @load_i64(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i64 @load_i64(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i64 [[RET]]
 ;
@@ -86,9 +86,9 @@ define i64 @load_i64(ptr addrspace(8) %buf) {
   ret i64 %ret
 }
 
-define void @store_i64(i64 %data, ptr addrspace(8) %buf) {
+define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i64(
-; CHECK-SAME: i64 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i64 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -97,9 +97,9 @@ define void @store_i64(i64 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i128 @load_i128(ptr addrspace(8) %buf) {
+define i128 @load_i128(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i128 @load_i128(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call i128 @llvm.amdgcn.raw.ptr.buffer.load.i128(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret i128 [[RET]]
 ;
@@ -108,9 +108,9 @@ define i128 @load_i128(ptr addrspace(8) %buf) {
   ret i128 %ret
 }
 
-define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
+define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i128(
-; CHECK-SAME: i128 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i128 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i128(i128 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -119,9 +119,9 @@ define void @store_i128(i128 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
+define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <1 x i32> @load_v1i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
 ; CHECK-NEXT:    ret <1 x i32> [[RET]]
@@ -131,9 +131,9 @@ define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) {
   ret <1 x i32> %ret
 }
 
-define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v1i32(
-; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA]] to i32
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -143,9 +143,9 @@ define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i32> @load_v2i32(ptr addrspace(8) %buf) {
+define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i32> @load_v2i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x i32> [[RET]]
 ;
@@ -154,9 +154,9 @@ define <2 x i32> @load_v2i32(ptr addrspace(8) %buf) {
   ret <2 x i32> %ret
 }
 
-define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i32(
-; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -165,9 +165,9 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x i32> @load_v3i32(ptr addrspace(8) %buf) {
+define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x i32> @load_v3i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <3 x i32> [[RET]]
 ;
@@ -176,9 +176,9 @@ define <3 x i32> @load_v3i32(ptr addrspace(8) %buf) {
   ret <3 x i32> %ret
 }
 
-define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3i32(
-; CHECK-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -187,9 +187,9 @@ define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i32> @load_v4i32(ptr addrspace(8) %buf) {
+define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i32> @load_v4i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
@@ -198,9 +198,9 @@ define <4 x i32> @load_v4i32(ptr addrspace(8) %buf) {
   ret <4 x i32> %ret
 }
 
-define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i32(
-; CHECK-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -209,9 +209,9 @@ define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i16> @load_v2i16(ptr addrspace(8) %buf) {
+define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i16> @load_v2i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x i16> [[RET]]
 ;
@@ -220,9 +220,9 @@ define <2 x i16> @load_v2i16(ptr addrspace(8) %buf) {
   ret <2 x i16> %ret
 }
 
-define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i16(
-; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -231,9 +231,9 @@ define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i16> @load_v4i16(ptr addrspace(8) %buf) {
+define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i16> @load_v4i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x i16> [[RET]]
 ;
@@ -242,9 +242,9 @@ define <4 x i16> @load_v4i16(ptr addrspace(8) %buf) {
   ret <4 x i16> %ret
 }
 
-define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i16(
-; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -253,9 +253,9 @@ define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x i16> @load_v8i16(ptr addrspace(8) %buf) {
+define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x i16> @load_v8i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <8 x i16> [[RET]]
 ;
@@ -264,9 +264,9 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) %buf) {
   ret <8 x i16> %ret
 }
 
-define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8i16(
-; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -275,9 +275,9 @@ define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i64> @load_v2i64(ptr addrspace(8) %buf) {
+define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i64> @load_v2i64(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x i64> @llvm.amdgcn.raw.ptr.buffer.load.v2i64(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x i64> [[RET]]
 ;
@@ -286,9 +286,9 @@ define <2 x i64> @load_v2i64(ptr addrspace(8) %buf) {
   ret <2 x i64> %ret
 }
 
-define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) %buf) {
+define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i64(
-; CHECK-SAME: <2 x i64> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i64(<2 x i64> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -297,9 +297,9 @@ define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define half @load_f16(ptr addrspace(8) %buf) {
+define half @load_f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define half @load_f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret half [[RET]]
 ;
@@ -308,9 +308,9 @@ define half @load_f16(ptr addrspace(8) %buf) {
   ret half %ret
 }
 
-define void @store_f16(half %data, ptr addrspace(8) %buf) {
+define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_f16(
-; CHECK-SAME: half [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: half [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -319,9 +319,9 @@ define void @store_f16(half %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define bfloat @load_bf16(ptr addrspace(8) %buf) {
+define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define bfloat @load_bf16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret bfloat [[RET]]
 ;
@@ -330,9 +330,9 @@ define bfloat @load_bf16(ptr addrspace(8) %buf) {
   ret bfloat %ret
 }
 
-define void @store_bf16(bfloat %data, ptr addrspace(8) %buf) {
+define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_bf16(
-; CHECK-SAME: bfloat [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: bfloat [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -341,9 +341,9 @@ define void @store_bf16(bfloat %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x half> @load_v2f16(ptr addrspace(8) %buf) {
+define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x half> @load_v2f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x half> [[RET]]
 ;
@@ -352,9 +352,9 @@ define <2 x half> @load_v2f16(ptr addrspace(8) %buf) {
   ret <2 x half> %ret
 }
 
-define void @store_v2f16(<2 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2f16(
-; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -363,9 +363,9 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x bfloat> @load_v4bf16(ptr addrspace(8) %buf) {
+define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x bfloat> @load_v4bf16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x bfloat> [[RET]]
 ;
@@ -374,9 +374,9 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) %buf) {
   ret <4 x bfloat> %ret
 }
 
-define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) %buf) {
+define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4bf16(
-; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -385,9 +385,9 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x half> @load_v8f16(ptr addrspace(8) %buf) {
+define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x half> @load_v8f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <8 x half> [[RET]]
 ;
@@ -396,9 +396,9 @@ define <8 x half> @load_v8f16(ptr addrspace(8) %buf) {
   ret <8 x half> %ret
 }
 
-define void @store_v8f16(<8 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8f16(
-; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -407,9 +407,9 @@ define void @store_v8f16(<8 x half> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define float @load_f32(ptr addrspace(8) %buf) {
+define float @load_f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define float @load_f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret float [[RET]]
 ;
@@ -418,9 +418,9 @@ define float @load_f32(ptr addrspace(8) %buf) {
   ret float %ret
 }
 
-define void @store_f32(float %data, ptr addrspace(8) %buf) {
+define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_f32(
-; CHECK-SAME: float [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: float [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -429,9 +429,9 @@ define void @store_f32(float %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x float> @load_v2f32(ptr addrspace(8) %buf) {
+define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x float> @load_v2f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x float> [[RET]]
 ;
@@ -440,9 +440,9 @@ define <2 x float> @load_v2f32(ptr addrspace(8) %buf) {
   ret <2 x float> %ret
 }
 
-define void @store_v2f32(<2 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2f32(
-; CHECK-SAME: <2 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -451,9 +451,9 @@ define void @store_v2f32(<2 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x float> @load_v3f32(ptr addrspace(8) %buf) {
+define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x float> @load_v3f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <3 x float> [[RET]]
 ;
@@ -462,9 +462,9 @@ define <3 x float> @load_v3f32(ptr addrspace(8) %buf) {
   ret <3 x float> %ret
 }
 
-define void @store_v3f32(<3 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3f32(
-; CHECK-SAME: <3 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -473,9 +473,9 @@ define void @store_v3f32(<3 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x float> @load_v4f32(ptr addrspace(8) %buf) {
+define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x float> @load_v4f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x float> [[RET]]
 ;
@@ -484,9 +484,9 @@ define <4 x float> @load_v4f32(ptr addrspace(8) %buf) {
   ret <4 x float> %ret
 }
 
-define void @store_v4f32(<4 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4f32(
-; CHECK-SAME: <4 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -495,9 +495,9 @@ define void @store_v4f32(<4 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(0) @load_p0(ptr addrspace(8) %buf) {
+define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr @load_p0(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr [[RET]]
 ;
@@ -506,9 +506,9 @@ define ptr addrspace(0) @load_p0(ptr addrspace(8) %buf) {
   ret ptr addrspace(0) %ret
 }
 
-define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) %buf) {
+define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p0(
-; CHECK-SAME: ptr [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p0(ptr [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -517,9 +517,9 @@ define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(1) @load_p1(ptr addrspace(8) %buf) {
+define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(1) @load_p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(1) [[RET]]
 ;
@@ -528,9 +528,9 @@ define ptr addrspace(1) @load_p1(ptr addrspace(8) %buf) {
   ret ptr addrspace(1) %ret
 }
 
-define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) %buf) {
+define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p1(
-; CHECK-SAME: ptr addrspace(1) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -539,9 +539,9 @@ define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(2) @load_p2(ptr addrspace(8) %buf) {
+define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(2) @load_p2(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(2) @llvm.amdgcn.raw.ptr.buffer.load.p2(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(2) [[RET]]
 ;
@@ -550,9 +550,9 @@ define ptr addrspace(2) @load_p2(ptr addrspace(8) %buf) {
   ret ptr addrspace(2) %ret
 }
 
-define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) %buf) {
+define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p2(
-; CHECK-SAME: ptr addrspace(2) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(2) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p2(ptr addrspace(2) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -561,9 +561,9 @@ define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(3) @load_p3(ptr addrspace(8) %buf) {
+define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(3) @load_p3(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(3) @llvm.amdgcn.raw.ptr.buffer.load.p3(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(3) [[RET]]
 ;
@@ -572,9 +572,9 @@ define ptr addrspace(3) @load_p3(ptr addrspace(8) %buf) {
   ret ptr addrspace(3) %ret
 }
 
-define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) %buf) {
+define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p3(
-; CHECK-SAME: ptr addrspace(3) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(3) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p3(ptr addrspace(3) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -583,9 +583,9 @@ define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(4) @load_p4(ptr addrspace(8) %buf) {
+define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(4) @load_p4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(4) [[RET]]
 ;
@@ -594,9 +594,9 @@ define ptr addrspace(4) @load_p4(ptr addrspace(8) %buf) {
   ret ptr addrspace(4) %ret
 }
 
-define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) %buf) {
+define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p4(
-; CHECK-SAME: ptr addrspace(4) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(4) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p4(ptr addrspace(4) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -605,9 +605,9 @@ define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(5) @load_p5(ptr addrspace(8) %buf) {
+define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(5) @load_p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(5) @llvm.amdgcn.raw.ptr.buffer.load.p5(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(5) [[RET]]
 ;
@@ -616,9 +616,9 @@ define ptr addrspace(5) @load_p5(ptr addrspace(8) %buf) {
   ret ptr addrspace(5) %ret
 }
 
-define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) %buf) {
+define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p5(
-; CHECK-SAME: ptr addrspace(5) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(5) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p5(ptr addrspace(5) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -627,9 +627,9 @@ define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(6) @load_p6(ptr addrspace(8) %buf) {
+define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(6) @load_p6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(6) @llvm.amdgcn.raw.ptr.buffer.load.p6(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(6) [[RET]]
 ;
@@ -638,9 +638,9 @@ define ptr addrspace(6) @load_p6(ptr addrspace(8) %buf) {
   ret ptr addrspace(6) %ret
 }
 
-define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) %buf) {
+define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p6(
-; CHECK-SAME: ptr addrspace(6) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(6) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p6(ptr addrspace(6) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -649,9 +649,9 @@ define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define ptr addrspace(8) @load_p8(ptr addrspace(8) %buf) {
+define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define ptr addrspace(8) @load_p8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call ptr addrspace(8) @llvm.amdgcn.raw.ptr.buffer.load.p8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret ptr addrspace(8) [[RET]]
 ;
@@ -660,9 +660,9 @@ define ptr addrspace(8) @load_p8(ptr addrspace(8) %buf) {
   ret ptr addrspace(8) %ret
 }
 
-define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) %buf) {
+define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_p8(
-; CHECK-SAME: ptr addrspace(8) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.p8(ptr addrspace(8) [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -671,9 +671,9 @@ define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) %buf) {
+define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x ptr addrspace(1)> @load_v2p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x ptr addrspace(1)> [[RET]]
 ;
@@ -682,9 +682,9 @@ define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) %buf) {
   ret <2 x ptr addrspace(1)> %ret
 }
 
-define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
+define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2p1(
-; CHECK-SAME: <2 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -693,9 +693,9 @@ define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) %buf) {
+define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x ptr addrspace(5)> @load_v2p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <2 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v2p5(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <2 x ptr addrspace(5)> [[RET]]
 ;
@@ -704,9 +704,9 @@ define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) %buf) {
   ret <2 x ptr addrspace(5)> %ret
 }
 
-define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2p5(
-; CHECK-SAME: <2 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p5(<2 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -715,9 +715,9 @@ define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) %buf) {
+define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x ptr addrspace(5)> @load_v3p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <3 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v3p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <3 x ptr addrspace(5)> [[RET]]
 ;
@@ -726,9 +726,9 @@ define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) %buf) {
   ret <3 x ptr addrspace(5)> %ret
 }
 
-define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3p5(
-; CHECK-SAME: <3 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3p5(<3 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -737,9 +737,9 @@ define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) %buf) {
+define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x ptr addrspace(5)> @load_v4p5(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET:%.*]] = call <4 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v4p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret <4 x ptr addrspace(5)> [[RET]]
 ;
@@ -748,9 +748,9 @@ define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) %buf) {
   ret <4 x ptr addrspace(5)> %ret
 }
 
-define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
+define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4p5(
-; CHECK-SAME: <4 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4p5(<4 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -761,9 +761,9 @@ define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) {
 
 ;;; 3 words in a short type. These need to be bitcast to <3 x i32> to be supported.
 
-define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
+define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x half> @load_v6f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x half>
 ; CHECK-NEXT:    ret <6 x half> [[RET]]
@@ -773,9 +773,9 @@ define <6 x half> @load_v6f16(ptr addrspace(8) %buf) {
   ret <6 x half> %ret
 }
 
-define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
+define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6f16(
-; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <6 x half> [[DATA]] to <3 x i32>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -787,14 +787,14 @@ define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) {
 
 ;;; Long types (32 bit elements). Must be split into multiple operations.
 
-define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
+define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <5 x float> @load_v5f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x float> poison, <5 x float> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x float> [[RET_PARTS_0]], float [[RET_PART_4]], i64 4
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x float> [[RET_PARTS_0]], float [[RET_OFF_16]], i64 4
 ; CHECK-NEXT:    ret <5 x float> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -802,9 +802,9 @@ define <5 x float> @load_v5f32(ptr addrspace(8) %buf) {
   ret <5 x float> %ret
 }
 
-define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v5f32(
-; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x float> [[DATA]], <5 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x float> [[DATA]], i64 4
@@ -816,14 +816,14 @@ define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
+define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x float> @load_v6f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x float> poison, <6 x float> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x float> [[RET_PART_4]], <2 x float> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x float> [[RET_OFF_16]], <2 x float> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET:%.*]] = shufflevector <6 x float> [[RET_PARTS_0]], <6 x float> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:    ret <6 x float> [[RET]]
 ;
@@ -832,9 +832,9 @@ define <6 x float> @load_v6f32(ptr addrspace(8) %buf) {
   ret <6 x float> %ret
 }
 
-define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6f32(
-; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x float> [[DATA]], <6 x float> poison, <2 x i32> <i32 4, i32 5>
@@ -846,14 +846,14 @@ define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
+define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <7 x float> @load_v7f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x float> poison, <7 x float> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <3 x float> [[RET_PART_4]], <3 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <3 x float> [[RET_OFF_16]], <3 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET:%.*]] = shufflevector <7 x float> [[RET_PARTS_0]], <7 x float> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 9>
 ; CHECK-NEXT:    ret <7 x float> [[RET]]
 ;
@@ -862,9 +862,9 @@ define <7 x float> @load_v7f32(ptr addrspace(8) %buf) {
   ret <7 x float> %ret
 }
 
-define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v7f32(
-; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <7 x float> [[DATA]], <7 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
@@ -876,14 +876,14 @@ define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
+define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x float> @load_v8f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x float> poison, <8 x float> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_PART_4]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_OFF_16]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET:%.*]] = shufflevector <8 x float> [[RET_PARTS_0]], <8 x float> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x float> [[RET]]
 ;
@@ -892,9 +892,9 @@ define <8 x float> @load_v8f32(ptr addrspace(8) %buf) {
   ret <8 x float> %ret
 }
 
-define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8f32(
-; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <8 x float> [[DATA]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -906,17 +906,17 @@ define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
+define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <10 x float> @load_v10f32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_PART_0]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x float> [[RET_OFF_0]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <10 x float> poison, <10 x float> [[RET_EXT_0]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_PART_4]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x float> [[RET_OFF_16]], <4 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <10 x float> [[RET_PARTS_0]], <10 x float> [[RET_EXT_4]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 12, i32 13, i32 8, i32 9>
-; CHECK-NEXT:    [[RET_PART_8:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_8:%.*]] = shufflevector <2 x float> [[RET_PART_8]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_32:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) align 32 [[BUF]], i32 32, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_8:%.*]] = shufflevector <2 x float> [[RET_OFF_32]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET:%.*]] = shufflevector <10 x float> [[RET_PARTS_4]], <10 x float> [[RET_EXT_8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
 ; CHECK-NEXT:    ret <10 x float> [[RET]]
 ;
@@ -925,9 +925,9 @@ define <10 x float> @load_v10f32(ptr addrspace(8) %buf) {
   ret <10 x float> %ret
 }
 
-define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
+define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v10f32(
-; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA_SLICE_0]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <10 x float> [[DATA]], <10 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -941,14 +941,14 @@ define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
+define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i32> @load_v6i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_PART_4]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:    ret <6 x i32> [[RET]]
 ;
@@ -957,9 +957,9 @@ define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) {
   ret <6 x i32> %ret
 }
 
-define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
+define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6i32(
-; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
@@ -971,14 +971,14 @@ define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
+define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x ptr addrspace(1)> @load_v4p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_PART_0]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_OFF_0]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <4 x ptr addrspace(1)> poison, <4 x ptr addrspace(1)> [[RET_EXT_0]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[RET_PART_2:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_2:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_PART_2]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_2:%.*]] = shufflevector <2 x ptr addrspace(1)> [[RET_OFF_16]], <2 x ptr addrspace(1)> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET:%.*]] = shufflevector <4 x ptr addrspace(1)> [[RET_PARTS_0]], <4 x ptr addrspace(1)> [[RET_EXT_2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x ptr addrspace(1)> [[RET]]
 ;
@@ -987,9 +987,9 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) {
   ret <4 x ptr addrspace(1)> %ret
 }
 
-define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
+define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4p1(
-; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = shufflevector <4 x ptr addrspace(1)> [[DATA]], <4 x ptr addrspace(1)> poison, <2 x i32> <i32 2, i32 3>
@@ -1003,9 +1003,9 @@ define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) {
 
 ;;; Uneven types with 16-bit elements. Require splitting into multiple operations.
 
-define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
+define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <1 x i16> @load_v1i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <1 x i16>
 ; CHECK-NEXT:    ret <1 x i16> [[RET]]
@@ -1015,9 +1015,9 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) {
   ret <1 x i16> %ret
 }
 
-define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v1i16(
-; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i16> [[DATA]] to i16
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_STORABLE]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1027,14 +1027,14 @@ define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
+define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x i16> @load_v3i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
-; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
 ; CHECK-NEXT:    ret <3 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1042,9 +1042,9 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) {
   ret <3 x i16> %ret
 }
 
-define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3i16(
-; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_2:%.*]] = extractelement <3 x i16> [[DATA]], i64 2
@@ -1056,14 +1056,14 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
+define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <5 x i16> @load_v5i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i16> [[RET_PART_0]], <4 x i16> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i16> [[RET_OFF_0]], <4 x i16> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i16> poison, <5 x i16> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i16> [[RET_PARTS_0]], i16 [[RET_PART_4]], i64 4
+; CHECK-NEXT:    [[RET_OFF_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_8]], i64 4
 ; CHECK-NEXT:    ret <5 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1071,9 +1071,9 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) {
   ret <5 x i16> %ret
 }
 
-define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v5i16(
-; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i16> [[DATA]], <5 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = extractelement <5 x i16> [[DATA]], i64 4
@@ -1085,9 +1085,9 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
+define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i16> @load_v6i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <6 x i16>
 ; CHECK-NEXT:    ret <6 x i16> [[RET]]
@@ -1097,9 +1097,9 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) {
   ret <6 x i16> %ret
 }
 
-define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6i16(
-; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <6 x i16> [[DATA]] to <3 x i32>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1109,15 +1109,15 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
+define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <7 x i16> @load_v7i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast <3 x i32> [[RET_PART_0]] to <6 x i16>
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <6 x i16> [[RET_PART_0_FROM_LOADABLE]], <6 x i16> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast <3 x i32> [[RET_OFF_0]] to <6 x i16>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <6 x i16> [[RET_OFF_0_FROM_LOADABLE]], <6 x i16> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x i16> poison, <7 x i16> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 6>
-; CHECK-NEXT:    [[RET_PART_6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i16> [[RET_PARTS_0]], i16 [[RET_PART_6]], i64 6
+; CHECK-NEXT:    [[RET_OFF_12:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 12, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_12]], i64 6
 ; CHECK-NEXT:    ret <7 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1125,9 +1125,9 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) {
   ret <7 x i16> %ret
 }
 
-define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v7i16(
-; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x i16> [[DATA]], <7 x i16> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <6 x i16> [[DATA_SLICE_0]] to <3 x i32>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
@@ -1140,14 +1140,14 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
+define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <9 x i16> @load_v9i16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <8 x i16> [[RET_PART_0]], <8 x i16> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <8 x i16> [[RET_OFF_0]], <8 x i16> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <9 x i16> poison, <9 x i16> [[RET_EXT_0]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
-; CHECK-NEXT:    [[RET_PART_8:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <9 x i16> [[RET_PARTS_0]], i16 [[RET_PART_8]], i64 8
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <9 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_16]], i64 8
 ; CHECK-NEXT:    ret <9 x i16> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1155,9 +1155,9 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) {
   ret <9 x i16> %ret
 }
 
-define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
+define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v9i16(
-; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <9 x i16> [[DATA]], <9 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[DATA_SLICE_8:%.*]] = extractelement <9 x i16> [[DATA]], i64 8
@@ -1173,9 +1173,9 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) {
 ;;; - Split into multiple operations
 ;;; - Bitcast if they have a natively supported width
 
-define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
+define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <1 x i8> @load_v1i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <1 x i8>
 ; CHECK-NEXT:    ret <1 x i8> [[RET]]
@@ -1185,9 +1185,9 @@ define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) {
   ret <1 x i8> %ret
 }
 
-define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v1i8(
-; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <1 x i8> [[DATA]] to i8
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1197,9 +1197,9 @@ define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
+define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i8> @load_v2i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <2 x i8>
 ; CHECK-NEXT:    ret <2 x i8> [[RET]]
@@ -1209,9 +1209,9 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) {
   ret <2 x i8> %ret
 }
 
-define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i8(
-; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i8> [[DATA]] to i16
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1221,15 +1221,15 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
+define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <3 x i8> @load_v3i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_PART_0]] to <2 x i8>
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i8> [[RET_PART_0_FROM_LOADABLE]], <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_0]] to <2 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i8> [[RET_OFF_0_FROM_LOADABLE]], <2 x i8> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i8> poison, <3 x i8> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
-; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i8> [[RET_PARTS_0]], i8 [[RET_PART_2]], i64 2
+; CHECK-NEXT:    [[RET_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <3 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_2]], i64 2
 ; CHECK-NEXT:    ret <3 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1237,9 +1237,9 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) %buf) {
   ret <3 x i8> %ret
 }
 
-define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v3i8(
-; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i8> [[DATA]], <3 x i8> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <2 x i8> [[DATA_SLICE_0]] to i16
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
@@ -1252,9 +1252,9 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
+define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i8> @load_v4i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[RET]]
@@ -1264,9 +1264,9 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) {
   ret <4 x i8> %ret
 }
 
-define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i8(
-; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1276,15 +1276,15 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
+define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <5 x i8> @load_v5i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_PART_4]], i64 4
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_4]], i64 4
 ; CHECK-NEXT:    ret <5 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1292,9 +1292,9 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) {
   ret <5 x i8> %ret
 }
 
-define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v5i8(
-; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
@@ -1307,14 +1307,14 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
+define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i8> @load_v6i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
-; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
 ; CHECK-NEXT:    ret <6 x i8> [[RET]]
 ;
@@ -1323,9 +1323,9 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) {
   ret <6 x i8> %ret
 }
 
-define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v6i8(
-; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
@@ -1338,19 +1338,19 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
+define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <7 x i8> @load_v7i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <7 x i8> poison, <7 x i8> [[RET_EXT_0]], <7 x i32> <i32 7, i32 8, i32 9, i32 10, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_PART_4_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_PART_4]] to <2 x i8>
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i8> [[RET_PART_4_FROM_LOADABLE]], <2 x i8> poison, <7 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_4_FROM_LOADABLE:%.*]] = bitcast i16 [[RET_OFF_4]] to <2 x i8>
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i8> [[RET_OFF_4_FROM_LOADABLE]], <2 x i8> poison, <7 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <7 x i8> [[RET_PARTS_0]], <7 x i8> [[RET_EXT_4]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 8, i32 6>
-; CHECK-NEXT:    [[RET_PART_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
-; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i8> [[RET_PARTS_4]], i8 [[RET_PART_6]], i64 6
+; CHECK-NEXT:    [[RET_OFF_6:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 2 [[BUF]], i32 6, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertelement <7 x i8> [[RET_PARTS_4]], i8 [[RET_OFF_6]], i64 6
 ; CHECK-NEXT:    ret <7 x i8> [[RET]]
 ;
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1358,9 +1358,9 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) {
   ret <7 x i8> %ret
 }
 
-define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v7i8(
-; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <7 x i8> [[DATA]], <7 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_SLICE_0_STORABLE]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
@@ -1376,9 +1376,9 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
+define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x i8> @load_v8i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <2 x i32> [[RET_LOADABLE]] to <8 x i8>
 ; CHECK-NEXT:    ret <8 x i8> [[RET]]
@@ -1388,9 +1388,9 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) {
   ret <8 x i8> %ret
 }
 
-define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8i8(
-; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <8 x i8> [[DATA]] to <2 x i32>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1400,9 +1400,9 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
+define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <12 x i8> @load_v12i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to <12 x i8>
 ; CHECK-NEXT:    ret <12 x i8> [[RET]]
@@ -1412,9 +1412,9 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) {
   ret <12 x i8> %ret
 }
 
-define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v12i8(
-; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <12 x i8> [[DATA]] to <3 x i32>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1424,9 +1424,9 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
+define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <16 x i8> @load_v16i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <4 x i32> [[RET_LOADABLE]] to <16 x i8>
 ; CHECK-NEXT:    ret <16 x i8> [[RET]]
@@ -1436,9 +1436,9 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) {
   ret <16 x i8> %ret
 }
 
-define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v16i8(
-; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <16 x i8> [[DATA]] to <4 x i32>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1448,14 +1448,14 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
+define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <32 x i8> @load_v32i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_PART_4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to <32 x i8>
 ; CHECK-NEXT:    ret <32 x i8> [[RET]]
@@ -1465,9 +1465,9 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) {
   ret <32 x i8> %ret
 }
 
-define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
+define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v32i8(
-; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i8> [[DATA]] to <8 x i32>
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
@@ -1482,9 +1482,9 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) {
 
 ;;; Arrays. Need to become vectors.
 
-define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
+define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [1 x i32] @load_a1i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_LOADABLE]] to <1 x i32>
 ; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <1 x i32> [[RET_FROM_LOADABLE]], i64 0
@@ -1496,9 +1496,9 @@ define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) {
   ret [1 x i32] %ret
 }
 
-define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
+define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a1i32(
-; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [1 x i32] [[DATA]], 0
 ; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <1 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
 ; CHECK-NEXT:    [[DATA_STORABLE:%.*]] = bitcast <1 x i32> [[DATA_AS_VEC_0]] to i32
@@ -1510,9 +1510,9 @@ define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
+define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [2 x i32] @load_a2i32(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x i32> [[RET_LOADABLE]], i64 0
 ; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET_ELEM_0]], 0
@@ -1525,9 +1525,9 @@ define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) {
   ret [2 x i32] %ret
 }
 
-define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
+define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a2i32(
-; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA]], 0
 ; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA_ELEM_0]], i64 0
 ; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA]], 1
@@ -1540,9 +1540,9 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
+define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [2 x half] @load_a2f16(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x half> [[RET_LOADABLE]], i64 0
 ; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x half] poison, half [[RET_ELEM_0]], 0
@@ -1555,9 +1555,9 @@ define [2 x half] @load_a2f16(ptr addrspace(8) %buf) {
   ret [2 x half] %ret
 }
 
-define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
+define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a2f16(
-; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x half] [[DATA]], 0
 ; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x half> poison, half [[DATA_ELEM_0]], i64 0
 ; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x half] [[DATA]], 1
@@ -1570,9 +1570,9 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
+define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define [2 x ptr addrspace(1)] @load_a2p1(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET_ELEM_0:%.*]] = extractelement <2 x ptr addrspace(1)> [[RET_LOADABLE]], i64 0
 ; CHECK-NEXT:    [[RET_AS_ARRAY_0:%.*]] = insertvalue [2 x ptr addrspace(1)] poison, ptr addrspace(1) [[RET_ELEM_0]], 0
@@ -1585,9 +1585,9 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) {
   ret [2 x ptr addrspace(1)] %ret
 }
 
-define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
+define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_a2p1(
-; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_ELEM_0:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 0
 ; CHECK-NEXT:    [[DATA_AS_VEC_0:%.*]] = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) [[DATA_ELEM_0]], i64 0
 ; CHECK-NEXT:    [[DATA_ELEM_1:%.*]] = extractvalue [2 x ptr addrspace(1)] [[DATA]], 1
@@ -1602,15 +1602,15 @@ define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) %buf) {
 
 ;;; Scalars of atypical width. Need to be cast to vectors and split.
 
-define i40 @load_i40(ptr addrspace(8) %buf) {
+define i40 @load_i40(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i40 @load_i40(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_PART_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_PART_0]] to <4 x i8>
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_PART_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_OFF_0_FROM_LOADABLE:%.*]] = bitcast i32 [[RET_OFF_0]] to <4 x i8>
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i8> [[RET_OFF_0_FROM_LOADABLE]], <4 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i8> poison, <5 x i8> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_PART_4]], i64 4
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i8> [[RET_PARTS_0]], i8 [[RET_OFF_4]], i64 4
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <5 x i8> [[RET_SLICE_4]] to i40
 ; CHECK-NEXT:    ret i40 [[RET]]
 ;
@@ -1619,9 +1619,9 @@ define i40 @load_i40(ptr addrspace(8) %buf) {
   ret i40 %ret
 }
 
-define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
+define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i40(
-; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i40 [[DATA]] to <5 x i8>
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i8> [[DATA_LEGAL]], <5 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[DATA_SLICE_0_STORABLE:%.*]] = bitcast <4 x i8> [[DATA_SLICE_0]] to i32
@@ -1635,9 +1635,9 @@ define void @store_i40(i40 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i96 @load_i96(ptr addrspace(8) %buf) {
+define i96 @load_i96(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i96 @load_i96(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i32> [[RET_LOADABLE]] to i96
 ; CHECK-NEXT:    ret i96 [[RET]]
@@ -1647,9 +1647,9 @@ define i96 @load_i96(ptr addrspace(8) %buf) {
   ret i96 %ret
 }
 
-define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
+define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i96(
-; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i96 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i96 [[DATA]] to <3 x i32>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA_LEGAL]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1659,14 +1659,14 @@ define void @store_i96(i96 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i160 @load_i160(ptr addrspace(8) %buf) {
+define i160 @load_i160(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i160 @load_i160(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <5 x i32> poison, <5 x i32> [[RET_EXT_0]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i32> [[RET_PARTS_0]], i32 [[RET_PART_4]], i64 4
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_SLICE_4:%.*]] = insertelement <5 x i32> [[RET_PARTS_0]], i32 [[RET_OFF_16]], i64 4
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <5 x i32> [[RET_SLICE_4]] to i160
 ; CHECK-NEXT:    ret i160 [[RET]]
 ;
@@ -1675,9 +1675,9 @@ define i160 @load_i160(ptr addrspace(8) %buf) {
   ret i160 %ret
 }
 
-define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
+define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i160(
-; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i160 [[DATA]] to <5 x i32>
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <5 x i32> [[DATA_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
@@ -1690,14 +1690,14 @@ define void @store_i160(i160 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i256 @load_i256(ptr addrspace(8) %buf) {
+define i256 @load_i256(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i256 @load_i256(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[RET_EXT_0]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_PART_4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 8 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <4 x i32> [[RET_OFF_16]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <8 x i32> [[RET_PARTS_0]], <8 x i32> [[RET_EXT_4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <8 x i32> [[RET_PARTS_4]] to i256
 ; CHECK-NEXT:    ret i256 [[RET]]
@@ -1707,9 +1707,9 @@ define i256 @load_i256(ptr addrspace(8) %buf) {
   ret i256 %ret
 }
 
-define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
+define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i256(
-; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast i256 [[DATA]] to <8 x i32>
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <8 x i32> [[DATA_LEGAL]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
@@ -1724,9 +1724,9 @@ define void @store_i256(i256 %data, ptr addrspace(8) %buf) {
 
 ;;; Non-byte-sized scalars. Require zero-extension.
 
-define i7 @load_i7(ptr addrspace(8) %buf) {
+define i7 @load_i7(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i7 @load_i7(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i7
 ; CHECK-NEXT:    ret i7 [[RET]]
@@ -1736,9 +1736,9 @@ define i7 @load_i7(ptr addrspace(8) %buf) {
   ret i7 %ret
 }
 
-define void @store_i7(i7 %data, ptr addrspace(8) %buf) {
+define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i7(
-; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i7 [[DATA]] to i8
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1748,9 +1748,9 @@ define void @store_i7(i7 %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define i4 @load_i4(ptr addrspace(8) %buf) {
+define i4 @load_i4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define i4 @load_i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = trunc i8 [[RET_LOADABLE]] to i4
 ; CHECK-NEXT:    ret i4 [[RET]]
@@ -1760,9 +1760,9 @@ define i4 @load_i4(ptr addrspace(8) %buf) {
   ret i4 %ret
 }
 
-define void @store_i4(i4 %data, ptr addrspace(8) %buf) {
+define void @store_i4(i4 %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_i4(
-; CHECK-SAME: i4 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i4 [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i4 [[DATA]] to i8
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1774,9 +1774,9 @@ define void @store_i4(i4 %data, ptr addrspace(8) %buf) {
 
 ;;; Byte-sized vectors of i4. Require casts.
 
-define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
+define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i4> @load_v2i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i8 [[RET_LOADABLE]] to <2 x i4>
 ; CHECK-NEXT:    ret <2 x i4> [[RET]]
@@ -1786,9 +1786,9 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) {
   ret <2 x i4> %ret
 }
 
-define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i4(
-; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <2 x i4> [[DATA]] to i8
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA_LEGAL]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1798,9 +1798,9 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
+define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i4> @load_v4i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i16 [[RET_LOADABLE]] to <4 x i4>
 ; CHECK-NEXT:    ret <4 x i4> [[RET]]
@@ -1810,9 +1810,9 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) {
   ret <4 x i4> %ret
 }
 
-define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v4i4(
-; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i4> [[DATA]] to i16
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_LEGAL]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1822,9 +1822,9 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
+define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <8 x i4> @load_v8i4(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <8 x i4>
 ; CHECK-NEXT:    ret <8 x i4> [[RET]]
@@ -1834,9 +1834,9 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) {
   ret <8 x i4> %ret
 }
 
-define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
+define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v8i4(
-; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <8 x i4> [[DATA]] to i32
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
@@ -1848,9 +1848,9 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) {
 
 ;;; Vectors of non-byte-sized integers.
 
-define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
+define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <2 x i6> @load_v2i6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    [[RET_TRUNC:%.*]] = trunc i16 [[RET_LOADABLE]] to i12
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i12 [[RET_TRUNC]] to <2 x i6>
@@ -1861,9 +1861,9 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) {
   ret <2 x i6> %ret
 }
 
-define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
+define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v2i6(
-; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_AS_SCALAR:%.*]] = bitcast <2 x i6> [[DATA]] to i12
 ; CHECK-NEXT:    [[DATA_ZEXT:%.*]] = zext i12 [[DATA_AS_SCALAR]] to i16
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA_ZEXT]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0)
@@ -1875,14 +1875,14 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) {
 }
 
 ;; Blocks of fp6 elements
-define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
+define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i32> @load_v32i6(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_PART_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <4 x i32> [[RET_OFF_0]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <6 x i32> poison, <6 x i32> [[RET_EXT_0]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; CHECK-NEXT:    [[RET_PART_4:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
-; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_PART_4]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RET_OFF_16:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_EXT_4:%.*]] = shufflevector <2 x i32> [[RET_OFF_16]], <2 x i32> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_4:%.*]] = shufflevector <6 x i32> [[RET_PARTS_0]], <6 x i32> [[RET_EXT_4]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <6 x i32> [[RET_PARTS_4]] to <32 x i6>
 ; CHECK-NEXT:    [[RET_CAST:%.*]] = bitcast <32 x i6> [[RET]] to <6 x i32>
@@ -1894,9 +1894,9 @@ define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) {
   ret <6 x i32> %ret.cast
 }
 
-define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
+define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v32i6(
-; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1913,9 +1913,9 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) {
 
 ;;; Modifiers
 
-define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
+define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <4 x i8> @volatile_load_v4i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RET_LOADABLE:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast i32 [[RET_LOADABLE]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[RET]]
@@ -1925,9 +1925,9 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) {
   ret <4 x i8> %ret
 }
 
-define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
+define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @volatile_store_v4i8(
-; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <4 x i8> [[DATA]] to i32
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_LEGAL]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    ret void
@@ -1937,14 +1937,14 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) {
   ret void
 }
 
-define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
+define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define <6 x i8> @volatile_load_v6i8(
-; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RET_PART_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
-; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_PART_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_OFF_0:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_EXT_0:%.*]] = shufflevector <2 x i16> [[RET_OFF_0]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; CHECK-NEXT:    [[RET_PARTS_0:%.*]] = shufflevector <3 x i16> poison, <3 x i16> [[RET_EXT_0]], <3 x i32> <i32 3, i32 4, i32 2>
-; CHECK-NEXT:    [[RET_PART_2:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
-; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_PART_2]], i64 2
+; CHECK-NEXT:    [[RET_OFF_4:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 4 [[BUF]], i32 4, i32 0, i32 -2147483648)
+; CHECK-NEXT:    [[RET_SLICE_2:%.*]] = insertelement <3 x i16> [[RET_PARTS_0]], i16 [[RET_OFF_4]], i64 2
 ; CHECK-NEXT:    [[RET:%.*]] = bitcast <3 x i16> [[RET_SLICE_2]] to <6 x i8>
 ; CHECK-NEXT:    ret <6 x i8> [[RET]]
 ;
@@ -1953,9 +1953,9 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) {
   ret <6 x i8> %ret
 }
 
-define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
+define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @volatile_store_v6i8(
-; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <6 x i8> [[DATA]] to <3 x i16>
 ; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <3 x i16> [[DATA_LEGAL]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA_SLICE_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648)
@@ -1967,3 +1967,243 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) {
   store volatile <6 x i8> %data, ptr addrspace(7) %p
   ret void
 }
+
+define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [2 x [2 x i32]] @load_a2a2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET0_ELEM_0:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 0
+; CHECK-NEXT:    [[RET0_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET0_ELEM_0]], 0
+; CHECK-NEXT:    [[RET0_ELEM_1:%.*]] = extractelement <2 x i32> [[RET0_OFF_0]], i64 1
+; CHECK-NEXT:    [[RET0_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET0_AS_ARRAY_0]], i32 [[RET0_ELEM_1]], 1
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [2 x [2 x i32]] poison, [2 x i32] [[RET0_AS_ARRAY_1]], 0
+; CHECK-NEXT:    [[RET1_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET1_ELEM_0:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 0
+; CHECK-NEXT:    [[RET1_AS_ARRAY_0:%.*]] = insertvalue [2 x i32] poison, i32 [[RET1_ELEM_0]], 0
+; CHECK-NEXT:    [[RET1_ELEM_1:%.*]] = extractelement <2 x i32> [[RET1_OFF_8]], i64 1
+; CHECK-NEXT:    [[RET1_AS_ARRAY_1:%.*]] = insertvalue [2 x i32] [[RET1_AS_ARRAY_0]], i32 [[RET1_ELEM_1]], 1
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x [2 x i32]] [[RET0]], [2 x i32] [[RET1_AS_ARRAY_1]], 1
+; CHECK-NEXT:    ret [2 x [2 x i32]] [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x [2 x i32]], ptr addrspace(7) %p
+  ret [2 x [2 x i32]] %ret
+}
+
+define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a2a2i32(
+; CHECK-SAME: [2 x [2 x i32]] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA0:%.*]] = extractvalue [2 x [2 x i32]] [[DATA]], 0
+; CHECK-NEXT:    [[DATA0_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA0]], 0
+; CHECK-NEXT:    [[DATA0_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA0_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA0_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA0]], 1
+; CHECK-NEXT:    [[DATA0_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA0_AS_VEC_0]], i32 [[DATA0_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA0_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA1:%.*]] = extractvalue [2 x [2 x i32]] [[DATA]], 1
+; CHECK-NEXT:    [[DATA1_ELEM_0:%.*]] = extractvalue [2 x i32] [[DATA1]], 0
+; CHECK-NEXT:    [[DATA1_AS_VEC_0:%.*]] = insertelement <2 x i32> poison, i32 [[DATA1_ELEM_0]], i64 0
+; CHECK-NEXT:    [[DATA1_ELEM_1:%.*]] = extractvalue [2 x i32] [[DATA1]], 1
+; CHECK-NEXT:    [[DATA1_AS_VEC_1:%.*]] = insertelement <2 x i32> [[DATA1_AS_VEC_0]], i32 [[DATA1_ELEM_1]], i64 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA1_AS_VEC_1]], ptr addrspace(8) align 4 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x [2 x i32]] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [2 x <2 x i32>] @load_a2v2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [2 x <2 x i32>] @load_a2v2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[RET0_OFF_0]], 0
+; CHECK-NEXT:    [[RET1_OFF_8:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [2 x <2 x i32>] [[RET0]], <2 x i32> [[RET1_OFF_8]], 1
+; CHECK-NEXT:    ret [2 x <2 x i32>] [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [2 x <2 x i32>], ptr addrspace(7) %p
+  ret [2 x <2 x i32>] %ret
+}
+
+define void @store_a2v2i32([2 x <2 x i32>] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a2v2i32(
+; CHECK-SAME: [2 x <2 x i32>] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA0:%.*]] = extractvalue [2 x <2 x i32>] [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA1:%.*]] = extractvalue [2 x <2 x i32>] [[DATA]], 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA1]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [2 x <2 x i32>] %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i32 } @load_sl_i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { i32 } @load_sl_i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_OFF_0:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { i32 } poison, i32 [[RET_0_OFF_0]], 0
+; CHECK-NEXT:    ret { i32 } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i32 }, ptr addrspace(7) %p
+  ret { i32 } %ret
+}
+
+define void @store_sl_i32s({ i32 } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_i32s(
+; CHECK-SAME: { i32 } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0:%.*]] = extractvalue { i32 } [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { { float } } @load_sl_sl_f32ss(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { { float } } @load_sl_sl_f32ss(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_0_OFF_0:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { { float } } poison, float [[RET_0_0_OFF_0]], 0, 0
+; CHECK-NEXT:    ret { { float } } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { { float } }, ptr addrspace(7) %p
+  ret { { float } } %ret
+}
+
+define void @store_sl_sl_f32ss({ { float } } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_sl_f32ss(
+; CHECK-SAME: { { float } } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0_0:%.*]] = extractvalue { { float } } [[DATA]], 0, 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA_0_0]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { { float } } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { <2 x i32> } @load_sl_v2i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { <2 x i32> } @load_sl_v2i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_OFF_0:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { <2 x i32> } poison, <2 x i32> [[RET_0_OFF_0]], 0
+; CHECK-NEXT:    ret { <2 x i32> } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { <2 x i32> }, ptr addrspace(7) %p
+  ret { <2 x i32> } %ret
+}
+
+define void @store_sl_v2i32s({ <2 x i32> } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_v2i32s(
+; CHECK-SAME: { <2 x i32> } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0:%.*]] = extractvalue { <2 x i32> } [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { <2 x i32> } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define { i64, i32 } @load_sl_i64i32s(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define { i64, i32 } @load_sl_i64i32s(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET_0_OFF_0:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET_0:%.*]] = insertvalue { i64, i32 } poison, i64 [[RET_0_OFF_0]], 0
+; CHECK-NEXT:    [[RET_1_OFF_8:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue { i64, i32 } [[RET_0]], i32 [[RET_1_OFF_8]], 1
+; CHECK-NEXT:    ret { i64, i32 } [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load { i64, i32 }, ptr addrspace(7) %p
+  ret { i64, i32 } %ret
+}
+
+define void @store_sl_i64i32s({ i64, i32 } %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_sl_i64i32s(
+; CHECK-SAME: { i64, i32 } [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA_0:%.*]] = extractvalue { i64, i32 } [[DATA]], 0
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA_0]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA_1:%.*]] = extractvalue { i64, i32 } [[DATA]], 1
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA_1]], ptr addrspace(8) align 8 [[BUF]], i32 8, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store { i64, i32 } %data, ptr addrspace(7) %p
+  ret void
+}
+
+define [4 x i7] @load_a4i7(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define [4 x i7] @load_a4i7(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET0_OFF_0:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[RET0_TRUNC:%.*]] = trunc i8 [[RET0_OFF_0]] to i7
+; CHECK-NEXT:    [[RET0:%.*]] = insertvalue [4 x i7] poison, i7 [[RET0_TRUNC]], 0
+; CHECK-NEXT:    [[RET1_OFF_1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT:    [[RET1_TRUNC:%.*]] = trunc i8 [[RET1_OFF_1]] to i7
+; CHECK-NEXT:    [[RET1:%.*]] = insertvalue [4 x i7] [[RET0]], i7 [[RET1_TRUNC]], 1
+; CHECK-NEXT:    [[RET2_OFF_2:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT:    [[RET2_TRUNC:%.*]] = trunc i8 [[RET2_OFF_2]] to i7
+; CHECK-NEXT:    [[RET2:%.*]] = insertvalue [4 x i7] [[RET1]], i7 [[RET2_TRUNC]], 2
+; CHECK-NEXT:    [[RET3_OFF_3:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT:    [[RET3_TRUNC:%.*]] = trunc i8 [[RET3_OFF_3]] to i7
+; CHECK-NEXT:    [[RET:%.*]] = insertvalue [4 x i7] [[RET2]], i7 [[RET3_TRUNC]], 3
+; CHECK-NEXT:    ret [4 x i7] [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load [4 x i7], ptr addrspace(7) %p
+  ret [4 x i7] %ret
+}
+
+define void @store_a4i7([4 x i7] %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_a4i7(
+; CHECK-SAME: [4 x i7] [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[DATA0:%.*]] = extractvalue [4 x i7] [[DATA]], 0
+; CHECK-NEXT:    [[DATA0_ZEXT:%.*]] = zext i7 [[DATA0]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA0_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA1:%.*]] = extractvalue [4 x i7] [[DATA]], 1
+; CHECK-NEXT:    [[DATA1_ZEXT:%.*]] = zext i7 [[DATA1]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA1_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 1, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA2:%.*]] = extractvalue [4 x i7] [[DATA]], 2
+; CHECK-NEXT:    [[DATA2_ZEXT:%.*]] = zext i7 [[DATA2]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA2_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 2, i32 0, i32 0)
+; CHECK-NEXT:    [[DATA3:%.*]] = extractvalue [4 x i7] [[DATA]], 3
+; CHECK-NEXT:    [[DATA3_ZEXT:%.*]] = zext i7 [[DATA3]] to i8
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA3_ZEXT]], ptr addrspace(8) align 1 [[BUF]], i32 3, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store [4 x i7] %data, ptr addrspace(7) %p
+  ret void
+}
+
+;;; Scalable vector. This isn't semantically meaningful but shouldn't crash.
+
+define <vscale x 2 x i32> @load_nxv2i32(ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define <vscale x 2 x i32> @load_nxv2i32(
+; CHECK-SAME: ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RET:%.*]] = call <vscale x 2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.nxv2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[RET]]
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  %ret = load <vscale x 2 x i32>, ptr addrspace(7) %p
+  ret <vscale x 2 x i32> %ret
+}
+
+define void @store_nxv2i32(<vscale x 2 x i32> %data, ptr addrspace(8) inreg %buf) {
+; CHECK-LABEL: define void @store_nxv2i32(
+; CHECK-SAME: <vscale x 2 x i32> [[DATA:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
+  store <vscale x 2 x i32> %data, ptr addrspace(7) %p
+  ret void
+}

>From cceac3bb71c107d3ff649d107551dd2f9a00e00b Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 4 Oct 2024 17:41:29 +0000
Subject: [PATCH 05/11] Missed an if around bitcast

---
 llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 75d48c0a7a606a..1c3c32d3a716cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -781,9 +781,7 @@ Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
         IRB.getIntNTy(DL.getTypeSizeInBits(LegalType).getFixedValue());
     Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast");
     Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc");
-    if (OrigType != ShortScalarTy)
-      return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
-    return Trunc;
+    return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
   }
   return IRB.CreateBitCast(V, OrigType, Name + ".real.ty");
 }

>From 70825eb2509253008f2fc4438516f5f8ab461bc6 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 23 Oct 2024 18:47:52 +0000
Subject: [PATCH 06/11] Address review comments

---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   | 49 ++++++++-----------
 ...mdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll |  8 ++-
 2 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 1c3c32d3a716cd..01c367e2e51355 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -605,7 +605,7 @@ namespace {
 /// 1. Recursing into structs (and arrays that don't share a memory layout with
 /// vectors) since the intrinsics can't handle complex types.
 /// 2. Converting arrays of non-aggregate, byte-sized types into their
-/// correspondinng vectors
+/// corresponding vectors
 /// 3. Bitcasting unsupported types, namely overly-long scalars and byte
 /// vectors, into vectors of supported types.
 /// 4. Splitting up excessively long reads/writes into multiple operations.
@@ -727,10 +727,7 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
   // Implicitly zero-extend to the next byte if needed
   if (!DL.typeSizeEqualsStoreSize(T))
     T = IRB.getIntNTy(Size.getFixedValue());
-  Type *ElemTy = T;
-  if (auto *VT = dyn_cast<FixedVectorType>(T)) {
-    ElemTy = VT->getElementType();
-  }
+  Type *ElemTy = T->getScalarType();
   if (isa<PointerType, ScalableVectorType>(ElemTy))
     // Pointers are always big enough, and we'll let scalable vectors through to
     // fail in codegen.
@@ -758,11 +755,11 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
 Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
     Value *V, Type *TargetType, const Twine &Name) {
   Type *SourceType = V->getType();
-  if (DL.getTypeSizeInBits(SourceType) != DL.getTypeSizeInBits(TargetType)) {
-    Type *ShortScalarTy =
-        IRB.getIntNTy(DL.getTypeSizeInBits(SourceType).getFixedValue());
-    Type *ByteScalarTy =
-        IRB.getIntNTy(DL.getTypeSizeInBits(TargetType).getFixedValue());
+  TypeSize SourceSize = DL.getTypeSizeInBits(SourceType);
+  TypeSize TargetSize = DL.getTypeSizeInBits(TargetType);
+  if (SourceSize != TargetSize) {
+    Type *ShortScalarTy = IRB.getIntNTy(SourceSize.getFixedValue());
+    Type *ByteScalarTy = IRB.getIntNTy(TargetSize.getFixedValue());
     Value *AsScalar = IRB.CreateBitCast(V, ShortScalarTy, Name + ".as.scalar");
     Value *Zext = IRB.CreateZExt(AsScalar, ByteScalarTy, Name + ".zext");
     V = Zext;
@@ -774,11 +771,11 @@ Value *LegalizeBufferContentTypesVisitor::makeLegalNonAggregate(
 Value *LegalizeBufferContentTypesVisitor::makeIllegalNonAggregate(
     Value *V, Type *OrigType, const Twine &Name) {
   Type *LegalType = V->getType();
-  if (DL.getTypeSizeInBits(LegalType) != DL.getTypeSizeInBits(OrigType)) {
-    Type *ShortScalarTy =
-        IRB.getIntNTy(DL.getTypeSizeInBits(OrigType).getFixedValue());
-    Type *ByteScalarTy =
-        IRB.getIntNTy(DL.getTypeSizeInBits(LegalType).getFixedValue());
+  TypeSize LegalSize = DL.getTypeSizeInBits(LegalType);
+  TypeSize OrigSize = DL.getTypeSizeInBits(OrigType);
+  if (LegalSize != OrigSize) {
+    Type *ShortScalarTy = IRB.getIntNTy(OrigSize.getFixedValue());
+    Type *ByteScalarTy = IRB.getIntNTy(LegalSize.getFixedValue());
     Value *AsScalar = IRB.CreateBitCast(V, ByteScalarTy, Name + ".bytes.cast");
     Value *Trunc = IRB.CreateTrunc(AsScalar, ShortScalarTy, Name + ".trunc");
     return IRB.CreateBitCast(Trunc, OrigType, Name + ".orig");
@@ -791,6 +788,8 @@ Type *LegalizeBufferContentTypesVisitor::intrinsicTypeFor(Type *LegalType) {
   if (!VT)
     return LegalType;
   Type *ET = VT->getElementType();
+  // Explicitly return the element type of 1-element vectors because the
+  // underlying intrinsics don't like <1 x T> even though it's a synonym for T.
   if (VT->getNumElements() == 1)
     return ET;
   if (DL.getTypeSizeInBits(LegalType) == 96 && DL.getTypeSizeInBits(ET) < 32)
@@ -917,9 +916,8 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
   if (auto *AT = dyn_cast<ArrayType>(PartType)) {
     Type *ElemTy = AT->getElementType();
     TypeSize AllocSize = DL.getTypeAllocSize(ElemTy);
-    if (!(ElemTy->isSingleValueType() &&
-          DL.getTypeSizeInBits(ElemTy) == 8 * AllocSize &&
-          !ElemTy->isVectorTy())) {
+    if (!ElemTy->isSingleValueType() ||
+        DL.getTypeSizeInBits(ElemTy) != 8 * AllocSize || ElemTy->isVectorTy()) {
       bool Changed = false;
       for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
                                                /*Inclusive=*/false)) {
@@ -963,9 +961,7 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
     // type will be a vector (ex. an i256 load will have LegalType = <8 x i32>).
     // But if we're already a scalar (which can happen if we're splitting up a
     // struct), the element type will be the legal type itself.
-    Type *ElemType = LegalType;
-    if (auto *VT = dyn_cast<FixedVectorType>(LegalType))
-      ElemType = VT->getElementType();
+    Type *ElemType = LegalType->getScalarType();
     unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
     AAMDNodes AANodes = OrigLI.getAAMetadata();
     if (IsAggPart && Slices.empty())
@@ -986,10 +982,8 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
       copyMetadataForLoad(*NewLI, OrigLI);
       NewLI->setAAMetadata(
           AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
-      if (OrigLI.isAtomic())
-        NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
-      if (OrigLI.isVolatile())
-        NewLI->setVolatile(OrigLI.isVolatile());
+      NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
+      NewLI->setVolatile(OrigLI.isVolatile());
       Value *Loaded = IRB.CreateBitCast(NewLI, SliceType,
                                         NewLI->getName() + ".from.loadable");
       LoadsRes = insertSlice(LoadsRes, Loaded, S, Name);
@@ -1042,9 +1036,8 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
   if (auto *AT = dyn_cast<ArrayType>(PartType)) {
     Type *ElemTy = AT->getElementType();
     TypeSize AllocSize = DL.getTypeAllocSize(ElemTy);
-    if (!(ElemTy->isSingleValueType() &&
-          DL.getTypeSizeInBits(ElemTy) == 8 * AllocSize &&
-          !ElemTy->isVectorTy())) {
+    if (!ElemTy->isSingleValueType() ||
+        DL.getTypeSizeInBits(ElemTy) != 8 * AllocSize || ElemTy->isVectorTy()) {
       bool Changed = false;
       for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
                                                /*Inclusive=*/false)) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
index 2cb8489bd53dab..ca88d73fa2ccf5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
@@ -1,5 +1,9 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 < %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s
+; Note: The exact error messages aren't important here, but are included to catch
+; anything changing.
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s 2>&1 \
+; RUN: | grep "LLVM ERROR: Scalarization of scalable vectors is not supported."
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s 2>&1 \
+; RUN: | grep "LLVM ERROR: Invalid size request on a scalable vector."
 define void @buffer_store_nxv2i32(ptr addrspace(8) inreg %rsrc, i32 %offset) {
   call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> poison, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void

>From 60e71f307274c3aaa41fb9bfe7d8f4447b46ac36 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 31 Oct 2024 17:19:18 +0000
Subject: [PATCH 07/11] Remove calls to AllocSize()

---
 llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 01c367e2e51355..95ecb3b7022735 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -915,15 +915,15 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
   }
   if (auto *AT = dyn_cast<ArrayType>(PartType)) {
     Type *ElemTy = AT->getElementType();
-    TypeSize AllocSize = DL.getTypeAllocSize(ElemTy);
     if (!ElemTy->isSingleValueType() ||
-        DL.getTypeSizeInBits(ElemTy) != 8 * AllocSize || ElemTy->isVectorTy()) {
+        !DL.typeSizeEqualsStoreSize(ElemTy) || ElemTy->isVectorTy()) {
+      TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
       bool Changed = false;
       for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
                                                /*Inclusive=*/false)) {
         AggIdxs.push_back(I);
         Changed |= visitLoadImpl(OrigLI, ElemTy, AggIdxs,
-                                 AggByteOff + I * AllocSize.getFixedValue(),
+                                 AggByteOff + I * ElemStoreSize.getFixedValue(),
                                  Result, Name + Twine(I));
         AggIdxs.pop_back();
       }

>From a6defa324d9bcdf3d36634c2798b591a513ca957 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 31 Oct 2024 19:33:54 +0000
Subject: [PATCH 08/11] Fix clang-format

---
 llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 95ecb3b7022735..de5a1fa967664a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -915,8 +915,8 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
   }
   if (auto *AT = dyn_cast<ArrayType>(PartType)) {
     Type *ElemTy = AT->getElementType();
-    if (!ElemTy->isSingleValueType() ||
-        !DL.typeSizeEqualsStoreSize(ElemTy) || ElemTy->isVectorTy()) {
+    if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
+        ElemTy->isVectorTy()) {
       TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
       bool Changed = false;
       for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),

>From 1cb9bb370ec0705e0745b408c2040970908377ef Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 1 Nov 2024 17:12:48 +0000
Subject: [PATCH 09/11] Missed a few bits of review feedback

---
 .../Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index de5a1fa967664a..56d3b539648bc9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1035,16 +1035,16 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
   }
   if (auto *AT = dyn_cast<ArrayType>(PartType)) {
     Type *ElemTy = AT->getElementType();
-    TypeSize AllocSize = DL.getTypeAllocSize(ElemTy);
-    if (!ElemTy->isSingleValueType() ||
-        DL.getTypeSizeInBits(ElemTy) != 8 * AllocSize || ElemTy->isVectorTy()) {
+    if (!ElemTy->isSingleValueType() || !DL.typeSizeEqualsStoreSize(ElemTy) ||
+        ElemTy->isVectorTy()) {
+      TypeSize ElemStoreSize = DL.getTypeStoreSize(ElemTy);
       bool Changed = false;
       for (auto I : llvm::iota_range<uint32_t>(0, AT->getNumElements(),
                                                /*Inclusive=*/false)) {
         AggIdxs.push_back(I);
         Changed |= std::get<0>(visitStoreImpl(
-            OrigSI, ElemTy, AggIdxs, AggByteOff + I * AllocSize.getFixedValue(),
-            Name + Twine(I)));
+            OrigSI, ElemTy, AggIdxs,
+            AggByteOff + I * ElemStoreSize.getFixedValue(), Name + Twine(I)));
         AggIdxs.pop_back();
       }
       return std::make_pair(Changed, /*ModifiedInPlace=*/false);
@@ -1081,9 +1081,7 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
   }
 
   Value *OrigPtr = OrigSI.getPointerOperand();
-  Type *ElemType = LegalType;
-  if (auto *VT = dyn_cast<FixedVectorType>(LegalType))
-    ElemType = VT->getElementType();
+  Type *ElemType = LegalType->getScalarType();
   if (IsAggPart && Slices.empty())
     Slices.emplace_back(/*Index=*/0, /*Length=*/1);
   unsigned ElemBytes = DL.getTypeStoreSize(ElemType);

>From ec6cc8cf0d1159a40d4513d560f56430682941db Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 5 Nov 2024 18:16:11 +0000
Subject: [PATCH 10/11] Split off NUW change, fix error test, add braces

---
 .../Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 11 ++++++++---
 ...lvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll |  9 +++++----
 .../AMDGPU/lower-buffer-fat-pointers-calls.ll      | 14 +++++++-------
 ...r-buffer-fat-pointers-unoptimized-debug-data.ll | 12 ++++++------
 4 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 56d3b539648bc9..1c56093264265d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -728,10 +728,11 @@ Type *LegalizeBufferContentTypesVisitor::legalNonAggregateFor(Type *T) {
   if (!DL.typeSizeEqualsStoreSize(T))
     T = IRB.getIntNTy(Size.getFixedValue());
   Type *ElemTy = T->getScalarType();
-  if (isa<PointerType, ScalableVectorType>(ElemTy))
+  if (isa<PointerType, ScalableVectorType>(ElemTy)) {
     // Pointers are always big enough, and we'll let scalable vectors through to
     // fail in codegen.
     return T;
+  }
   unsigned ElemSize = DL.getTypeSizeInBits(ElemTy).getFixedValue();
   if (isPowerOf2_32(ElemSize) && ElemSize >= 16 && ElemSize <= 128) {
     // [vectors of] anything that's 16/32/64/128 bits can be cast and split into
@@ -1809,7 +1810,7 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
   auto [Rsrc, Off] = getPtrParts(Ptr);
   const DataLayout &DL = GEP.getDataLayout();
-  bool IsNUW = GEP.hasNoUnsignedWrap();
+  bool InBounds = GEP.isInBounds();
 
   // In order to call emitGEPOffset() and thus not have to reimplement it,
   // we need the GEP result to have ptr addrspace(7) type.
@@ -1824,12 +1825,16 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     return {Rsrc, Off};
   }
 
+  bool HasNonNegativeOff = false;
+  if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
+    HasNonNegativeOff = !CI->isNegative();
+  }
   Value *NewOff;
   if (match(Off, m_Zero())) {
     NewOff = OffAccum;
   } else {
     NewOff = IRB.CreateAdd(Off, OffAccum, "",
-                           /*hasNUW=*/IsNUW,
+                           /*hasNUW=*/InBounds && HasNonNegativeOff,
                            /*hasNSW=*/false);
   }
   copyMetadata(NewOff, &GEP);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
index ca88d73fa2ccf5..a91d38a58a1e9d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.nxv2i32.fail.ll
@@ -1,9 +1,10 @@
 ; Note: The exact error messages aren't important here, but are included to catch
 ; anything changing.
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s 2>&1 \
-; RUN: | grep "LLVM ERROR: Scalarization of scalable vectors is not supported."
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s 2>&1 \
-; RUN: | grep "LLVM ERROR: Invalid size request on a scalable vector."
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900  -filetype=null < %s 2>&1 | FileCheck %s --check-prefix=SDAG
+; SDAG: LLVM ERROR: Scalarization of scalable vectors is not supported.
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900  -filetype=null < %s 2>&1 | FileCheck %s --check-prefix=GISEL
+; GISEL: LLVM ERROR: Invalid size request on a scalable vector.
+
 define void @buffer_store_nxv2i32(ptr addrspace(8) inreg %rsrc, i32 %offset) {
   call void @llvm.amdgcn.raw.ptr.buffer.store.nxv2i32(<vscale x 2 x i32> poison, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 7e768982ba4286..e483a122615ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -91,12 +91,12 @@ define void @caller(ptr addrspace(7) noundef nonnull %arg) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i160 [[V_INT_RSRC]], 32
 ; CHECK-NEXT:    [[V_INT_OFF:%.*]] = zext i32 [[V_OFF]] to i160
 ; CHECK-NEXT:    [[V_INT:%.*]] = or i160 [[TMP1]], [[V_INT_OFF]]
-; CHECK-NEXT:    [[V_INT_CAST:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
-; CHECK-NEXT:    [[V_INT_CAST_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_CAST]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_CAST_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
-; CHECK-NEXT:    [[ARG_PART_4:%.*]] = add nuw i32 [[ARG_OFF]], 16
-; CHECK-NEXT:    [[V_INT_CAST_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_CAST]], i64 4
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_CAST_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
+; CHECK-NEXT:    [[V_INT_LEGAL:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
+; CHECK-NEXT:    [[V_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
+; CHECK-NEXT:    [[ARG_PART_4:%.*]] = add i32 [[ARG_OFF]], 16
+; CHECK-NEXT:    [[V_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_LEGAL]], i64 4
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
   %v = call ptr addrspace(7) @extern(ptr addrspace(7) %arg)
@@ -109,7 +109,7 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n
 ; CHECK-SAME: ({ ptr addrspace(8), i32 } noundef [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0
 ; CHECK-NEXT:    [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1
-; CHECK-NEXT:    [[RET:%.*]] = add i32 [[ARG_OFF]], 4
+; CHECK-NEXT:    [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1
 ; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[TMP2]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index 4b47380e7cf145..bf59c1669d226c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -54,12 +54,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw i160 [[BUF_PTR_4_PTR_INT_RSRC]], 32, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_OFF:%.*]] = zext i32 [[BUF_PTR_4_PTR_OFF]] to i160, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT:%.*]] = or i160 [[TMP10]], [[BUF_PTR_4_PTR_INT_OFF]], !dbg [[DBG33]]
-; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_CAST:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]]
-; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_CAST_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_CAST]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_CAST_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
-; CHECK-NEXT:    [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
-; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_CAST_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_CAST]], i64 4, !dbg [[DBG33]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_CAST_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
+; CHECK-NEXT:    [[AUX_PTR_2_PTR_PART_4:%.*]] = add i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
+; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], i64 4, !dbg [[DBG33]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
 ; CHECK-NEXT:    ret float [[RET]], !dbg [[DBG34:![0-9]+]]
 ;
   %buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20

>From 21d7de19a83de9af9ff20aa10d47b52145dad29a Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 18 Nov 2024 18:12:19 +0000
Subject: [PATCH 11/11] Address review feedback, fix tests

---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |   39 +-
 ...ffer-fat-pointers-contents-legalization.ll | 2200 ++++-------------
 .../AMDGPU/lower-buffer-fat-pointers-calls.ll |    2 +-
 ...fer-fat-pointers-unoptimized-debug-data.ll |    2 +-
 4 files changed, 458 insertions(+), 1785 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 06d485b34d972b..2b802b16cc0df3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -637,33 +637,33 @@ class LegalizeBufferContentTypesVisitor
   Value *makeIllegalNonAggregate(Value *V, Type *OrigType, const Twine &Name);
 
   struct VecSlice {
-    uint64_t Index;
-    uint64_t Length;
-    VecSlice(uint64_t Index, uint64_t Length) : Index(Index), Length(Length) {}
+    uint64_t Index = 0;
+    uint64_t Length = 0;
+    VecSlice() = delete;
   };
-  // Return the [index, length] pairs into which `T` needs to be cut to form
-  // legal buffer load or store operations. Clears `Slices`. Creates an empty
-  // `Slices` for non-vector inputs and creates one slice if no slicing will be
-  // needed.
+  /// Return the [index, length] pairs into which `T` needs to be cut to form
+  /// legal buffer load or store operations. Clears `Slices`. Creates an empty
+  /// `Slices` for non-vector inputs and creates one slice if no slicing will be
+  /// needed.
   void getVecSlices(Type *T, SmallVectorImpl<VecSlice> &Slices);
 
   Value *extractSlice(Value *Vec, VecSlice S, const Twine &Name);
   Value *insertSlice(Value *Whole, Value *Part, VecSlice S, const Twine &Name);
 
-  // In most cases, return `LegalType`. However, when given an input that would
-  // normally be a legal type for the buffer intrinsics to return but that isn't
-  // hooked up through SelectionDAG, return a type of the same width that can be
-  // used with the relevant intrinsics. Specifically, handle the cases:
-  // - <1 x T> => T for all T
-  // - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
-  // - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x
-  // i32>
+  /// In most cases, return `LegalType`. However, when given an input that would
+  /// normally be a legal type for the buffer intrinsics to return but that
+  /// isn't hooked up through SelectionDAG, return a type of the same width that
+  /// can be used with the relevant intrinsics. Specifically, handle the cases:
+  /// - <1 x T> => T for all T
+  /// - <N x i8> <=> i16, i32, 2xi32, 4xi32 (as needed)
+  /// - <N x T> where T is under 32 bits and the total size is 96 bits <=> <3 x
+  /// i32>
   Type *intrinsicTypeFor(Type *LegalType);
 
   bool visitLoadImpl(LoadInst &OrigLI, Type *PartType,
                      SmallVectorImpl<uint32_t> &AggIdxs, uint64_t AggByteOffset,
                      Value *&Result, const Twine &Name);
-  // Return value is (Changed, ModifiedInPlace)
+  /// Return value is (Changed, ModifiedInPlace)
   std::pair<bool, bool> visitStoreImpl(StoreInst &OrigSI, Type *PartType,
                                        SmallVectorImpl<uint32_t> &AggIdxs,
                                        uint64_t AggByteOffset,
@@ -838,7 +838,8 @@ void LegalizeBufferContentTypesVisitor::getVecSlices(
   uint64_t Index = 0;
   auto TrySlice = [&](unsigned MaybeLen) {
     if (MaybeLen > 0 && Index + MaybeLen <= TotalElems) {
-      Slices.emplace_back(/*Index=*/Index, /*Length=*/MaybeLen);
+      VecSlice Slice{/*Index=*/Index, /*Length=*/MaybeLen};
+      Slices.push_back(Slice);
       Index += MaybeLen;
       return true;
     }
@@ -966,7 +967,7 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
     unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
     AAMDNodes AANodes = OrigLI.getAAMetadata();
     if (IsAggPart && Slices.empty())
-      Slices.emplace_back(/*Index=*/0, /*Length=*/1);
+      Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
     for (VecSlice S : Slices) {
       Type *SliceType =
           S.Length != 1 ? FixedVectorType::get(ElemType, S.Length) : ElemType;
@@ -1084,7 +1085,7 @@ std::pair<bool, bool> LegalizeBufferContentTypesVisitor::visitStoreImpl(
   Value *OrigPtr = OrigSI.getPointerOperand();
   Type *ElemType = LegalType->getScalarType();
   if (IsAggPart && Slices.empty())
-    Slices.emplace_back(/*Index=*/0, /*Length=*/1);
+    Slices.push_back(VecSlice{/*Index=*/0, /*Length=*/1});
   unsigned ElemBytes = DL.getTypeStoreSize(ElemType);
   AAMDNodes AANodes = OrigSI.getAAMetadata();
   for (VecSlice S : Slices) {
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 3974cc1054bb5c..4c7a4ba3a44a5f 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -12,22 +12,14 @@ define i8 @load_i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -39,22 +31,14 @@ define void @store_i8(i8 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -66,22 +50,14 @@ define i16 @load_i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -93,22 +69,14 @@ define void @store_i16(i16 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -120,22 +88,14 @@ define i32 @load_i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -147,22 +107,14 @@ define void @store_i32(i32 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -174,22 +126,14 @@ define i64 @load_i64(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i64:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -201,22 +145,14 @@ define void @store_i64(i64 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i64:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -228,22 +164,14 @@ define i128 @load_i128(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i128:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i128:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -255,22 +183,14 @@ define void @store_i128(i128 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i128:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i128:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -282,22 +202,14 @@ define <1 x i32> @load_v1i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v1i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v1i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -309,22 +221,14 @@ define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v1i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v1i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -336,22 +240,14 @@ define <2 x i32> @load_v2i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -363,22 +259,14 @@ define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -390,22 +278,14 @@ define <3 x i32> @load_v3i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v3i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v3i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -417,22 +297,14 @@ define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v3i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v3i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -444,22 +316,14 @@ define <4 x i32> @load_v4i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v4i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -471,22 +335,14 @@ define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v4i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v4i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -498,22 +354,14 @@ define <2 x i16> @load_v2i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v2i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -525,22 +373,14 @@ define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v2i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v2i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -552,22 +392,14 @@ define <4 x i16> @load_v4i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v4i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -579,22 +411,14 @@ define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v4i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v4i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -606,22 +430,14 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v8i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v8i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -633,22 +449,14 @@ define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v8i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v8i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -660,22 +468,14 @@ define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2i64:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v2i64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -687,22 +487,14 @@ define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v2i64:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v2i64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -714,22 +506,14 @@ define half @load_f16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -741,22 +525,14 @@ define void @store_f16(half %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -768,22 +544,14 @@ define bfloat @load_bf16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_bf16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_bf16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -795,22 +563,14 @@ define void @store_bf16(bfloat %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_bf16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_bf16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -822,22 +582,14 @@ define <2 x half> @load_v2f16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v2f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -849,22 +601,14 @@ define void @store_v2f16(<2 x half> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v2f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v2f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -876,22 +620,14 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4bf16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v4bf16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -904,11 +640,7 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v4bf16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -917,13 +649,9 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GISEL-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -935,22 +663,14 @@ define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v8f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v8f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -962,22 +682,14 @@ define void @store_v8f16(<8 x half> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v8f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v8f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -989,22 +701,14 @@ define float @load_f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1016,22 +720,14 @@ define void @store_f32(float %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1043,22 +739,14 @@ define <2 x float> @load_v2f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v2f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1070,22 +758,14 @@ define void @store_v2f32(<2 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v2f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v2f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1097,22 +777,14 @@ define <3 x float> @load_v3f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v3f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v3f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1124,22 +796,14 @@ define void @store_v3f32(<3 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v3f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v3f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1151,22 +815,14 @@ define <4 x float> @load_v4f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v4f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1178,22 +834,14 @@ define void @store_v4f32(<4 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v4f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v4f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1205,22 +853,14 @@ define ptr addrspace(0) @load_p0(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p0:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1232,22 +872,14 @@ define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p0:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1259,22 +891,14 @@ define ptr addrspace(1) @load_p1(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1286,22 +910,14 @@ define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1313,22 +929,14 @@ define ptr addrspace(2) @load_p2(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p2:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p2:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1340,22 +948,14 @@ define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p2:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p2:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1367,22 +967,14 @@ define ptr addrspace(3) @load_p3(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p3:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p3:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1394,22 +986,14 @@ define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p3:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p3:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1421,22 +1005,14 @@ define ptr addrspace(4) @load_p4(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p4:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p4:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1448,22 +1024,14 @@ define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p4:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p4:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1475,22 +1043,14 @@ define ptr addrspace(5) @load_p5(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1502,22 +1062,14 @@ define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1529,22 +1081,14 @@ define ptr addrspace(6) @load_p6(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p6:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p6:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1556,22 +1100,14 @@ define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p6:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p6:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1583,22 +1119,14 @@ define ptr addrspace(8) @load_p8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_p8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_p8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1610,22 +1138,14 @@ define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_p8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_p8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1637,22 +1157,14 @@ define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v2p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1664,22 +1176,14 @@ define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %bu
 ; SDAG-LABEL: store_v2p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v2p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1691,22 +1195,14 @@ define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v2p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1718,22 +1214,14 @@ define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %bu
 ; SDAG-LABEL: store_v2p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v2p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1745,22 +1233,14 @@ define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v3p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v3p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1772,22 +1252,14 @@ define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %bu
 ; SDAG-LABEL: store_v3p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v3p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1799,22 +1271,14 @@ define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v4p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1826,22 +1290,14 @@ define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %bu
 ; SDAG-LABEL: store_v4p5:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v4p5:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1855,22 +1311,14 @@ define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v6f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v6f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1882,22 +1330,14 @@ define void @store_v6f16(<6 x half> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v6f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v6f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1911,24 +1351,16 @@ define <5 x float> @load_v5f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v5f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v5f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1940,24 +1372,16 @@ define void @store_v5f32(<5 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v5f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v5f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1969,24 +1393,16 @@ define <6 x float> @load_v6f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v6f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v6f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1998,24 +1414,16 @@ define void @store_v6f32(<6 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v6f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v6f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2027,24 +1435,16 @@ define <7 x float> @load_v7f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v7f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx3 v[4:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[4:6], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v7f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[4:6], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2056,24 +1456,16 @@ define void @store_v7f32(<7 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v7f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx3 v[4:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[4:6], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v7f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[4:6], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2085,24 +1477,16 @@ define <8 x float> @load_v8f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v8f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v8f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2114,24 +1498,16 @@ define void @store_v8f32(<8 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v8f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v8f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2143,26 +1519,18 @@ define <10 x float> @load_v10f32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v10f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SDAG-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx2 v[8:9], off, s[16:19], 0 offset:32
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v10f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GISEL-NEXT:    buffer_load_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx2 v[8:9], off, s[16:19], 0 offset:32
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2174,26 +1542,18 @@ define void @store_v10f32(<10 x float> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v10f32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SDAG-NEXT:    buffer_store_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx2 v[8:9], off, s[16:19], 0 offset:32
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v10f32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GISEL-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx2 v[8:9], off, s[16:19], 0 offset:32
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2205,24 +1565,16 @@ define <6 x i32> @load_v6i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v6i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v6i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2234,24 +1586,16 @@ define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v6i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v6i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2263,24 +1607,16 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v4p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2292,24 +1628,16 @@ define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %bu
 ; SDAG-LABEL: store_v4p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v4p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2323,22 +1651,14 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v1i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v1i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2350,22 +1670,14 @@ define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v1i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v1i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2377,24 +1689,16 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v3i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v3i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2406,24 +1710,16 @@ define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v3i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_short v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v1, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v3i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v1, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2435,24 +1731,16 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v5i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:8
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v2, off, s[16:19], 0 offset:8
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v5i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:8
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v2, off, s[16:19], 0 offset:8
 ; GISEL-NEXT:    s_mov_b32 s4, 0xffff
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
@@ -2468,24 +1756,16 @@ define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v5i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_short v2, off, s[8:11], 0 offset:8
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:8
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v5i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:8
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:8
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2497,22 +1777,14 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v6i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v6i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2524,22 +1796,14 @@ define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v6i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v6i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2551,24 +1815,16 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v7i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ushort v3, off, s[8:11], 0 offset:12
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v3, off, s[16:19], 0 offset:12
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v7i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 offset:12
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v3, off, s[16:19], 0 offset:12
 ; GISEL-NEXT:    s_mov_b32 s4, 0xffff
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
@@ -2585,24 +1841,16 @@ define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v7i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_short v3, off, s[8:11], 0 offset:12
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v3, off, s[16:19], 0 offset:12
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v7i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_short v3, off, s[4:7], 0 offset:12
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v3, off, s[16:19], 0 offset:12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2614,24 +1862,16 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v9i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v9i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_mov_b32 s4, 0xffff
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_bfi_b32 v0, s4, v0, v0
@@ -2649,24 +1889,16 @@ define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v9i16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_short v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v9i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_short v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2682,22 +1914,14 @@ define <1 x i8> @load_v1i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v1i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v1i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2709,22 +1933,14 @@ define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v1i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v1i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2736,11 +1952,7 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -2748,11 +1960,7 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v2i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2766,12 +1974,8 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2780,12 +1984,8 @@ define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2797,12 +1997,8 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v3i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
 ; SDAG-NEXT:    s_waitcnt vmcnt(1)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -2811,12 +2007,8 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v3i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:2
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -2831,13 +2023,9 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_byte v2, off, s[8:11], 0 offset:2
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_byte v2, off, s[16:19], 0 offset:2
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2846,13 +2034,9 @@ define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:2
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_byte v2, off, s[16:19], 0 offset:2
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2864,11 +2048,7 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
@@ -2878,11 +2058,7 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v4i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -2901,12 +2077,8 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2921,12 +2093,8 @@ define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2938,12 +2106,8 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v5i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ubyte v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v4, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(1)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -2954,12 +2118,8 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v5i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -2979,13 +2139,9 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_byte v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_byte v4, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3000,13 +2156,9 @@ define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_byte v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_byte v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3018,12 +2170,8 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v6i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:4
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v6, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(1)
 ; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v6
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -3038,12 +2186,8 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v6i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
@@ -3064,15 +2208,11 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_short v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3087,15 +2227,11 @@ define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v5
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3107,13 +2243,9 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v7i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:4
-; SDAG-NEXT:    buffer_load_ubyte v6, off, s[8:11], 0 offset:6
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    buffer_load_ubyte v6, off, s[16:19], 0 offset:6
 ; SDAG-NEXT:    s_waitcnt vmcnt(2)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -3126,13 +2258,9 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v7i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:4
-; GISEL-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:6
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    buffer_load_ubyte v6, off, s[16:19], 0 offset:6
 ; GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -3154,16 +2282,12 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v0, 8, v5
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0 offset:4
-; SDAG-NEXT:    buffer_store_byte v6, off, s[8:11], 0 offset:6
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0 offset:4
+; SDAG-NEXT:    buffer_store_byte v6, off, s[16:19], 0 offset:6
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3178,17 +2302,13 @@ define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v5
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GISEL-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
-; GISEL-NEXT:    buffer_store_byte v6, off, s[4:7], 0 offset:6
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    buffer_store_byte v6, off, s[16:19], 0 offset:6
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3200,11 +2320,7 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v8i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
@@ -3219,11 +2335,7 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v8i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -3251,13 +2363,9 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dwordx2 v[3:4], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[3:4], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3279,12 +2387,8 @@ define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_or_b32 v1, v4, v8, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v1, v1, v2, v3
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3296,11 +2400,7 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v12i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v8, v2
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
@@ -3320,11 +2420,7 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v12i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
@@ -3361,14 +2457,10 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dwordx3 v[6:8], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[6:8], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3397,12 +2489,8 @@ define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_or_b32 v2, v8, v12, v2
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3414,11 +2502,7 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v16i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_lshrrev_b64 v[18:19], 24, v[0:1]
 ; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[2:3]
@@ -3443,11 +2527,7 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v16i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -3493,15 +2573,11 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dwordx4 v[9:12], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[9:12], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3537,12 +2613,8 @@ define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_or_b32 v3, v12, v16, v3
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v3, v3, v4, v5
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3554,12 +2626,8 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v32i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[33:36], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx4 v[48:51], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[33:36], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[48:51], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(1)
 ; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[33:34]
 ; SDAG-NEXT:    v_lshrrev_b64 v[11:12], 24, v[35:36]
@@ -3599,12 +2667,8 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v32i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[16:19], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
@@ -3679,10 +2743,6 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v21
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v2, 8, v23
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v3, 8, v17
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v15, 8, v19
 ; SDAG-NEXT:    v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -3691,7 +2751,7 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v4, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3700,7 +2760,7 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v0, 8, v14
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dwordx4 v[3:6], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[3:6], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3761,17 +2821,13 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_or3_b32 v6, v6, v8, v9
 ; GISEL-NEXT:    v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GISEL-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_and_or_b32 v8, v28, v32, v8
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; GISEL-NEXT:    v_or3_b32 v7, v8, v9, v7
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3785,22 +2841,14 @@ define [1 x i32] @load_a1i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_a1i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_a1i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3812,22 +2860,14 @@ define void @store_a1i32([1 x i32] %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_a1i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_a1i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3839,22 +2879,14 @@ define [2 x i32] @load_a2i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_a2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_a2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3866,22 +2898,14 @@ define void @store_a2i32([2 x i32] %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_a2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_a2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3893,11 +2917,7 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_a2f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3905,11 +2925,7 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_a2f16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3923,12 +2939,8 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0x5040100
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_perm_b32 v0, v1, v0, s4
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3936,12 +2948,8 @@ define void @store_a2f16([2 x half] %data, ptr addrspace(8) inreg %buf) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3953,22 +2961,14 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_a2p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_a2p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3980,22 +2980,14 @@ define void @store_a2p1([2 x ptr addrspace(1)] %data, ptr addrspace(8) inreg %bu
 ; SDAG-LABEL: store_a2p1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_a2p1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4009,24 +3001,16 @@ define i40 @load_i40(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i40:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i40:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
@@ -4051,24 +3035,16 @@ define void @store_i40(i40 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i40:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_byte v1, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_byte v1, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i40:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_byte v1, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4080,22 +3056,14 @@ define i96 @load_i96(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i96:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i96:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4107,22 +3075,14 @@ define void @store_i96(i96 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i96:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i96:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4134,12 +3094,8 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i160:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_mov_b32 s4, s33
 ; SDAG-NEXT:    s_add_i32 s33, s32, 0x7c0
 ; SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff800
@@ -4152,12 +3108,8 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_i160:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4173,12 +3125,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    s_add_i32 s33, s32, 0x7c0
 ; SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff800
 ; SDAG-NEXT:    s_addk_i32 s32, 0x1000
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dword v4, off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_addk_i32 s32, 0xf000
 ; SDAG-NEXT:    s_mov_b32 s33, s4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4187,12 +3135,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: store_i160:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dword v4, off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4204,24 +3148,16 @@ define i256 @load_i256(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i256:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i256:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4233,24 +3169,16 @@ define void @store_i256(i256 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i256:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i256:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4264,22 +3192,14 @@ define i7 @load_i7(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i7:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i7:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4291,24 +3211,16 @@ define void @store_i7(i7 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i7:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i7:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4320,22 +3232,14 @@ define i4 @load_i4(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_i4:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_i4:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4347,24 +3251,16 @@ define void @store_i4(i4 %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_i4:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_i4:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4379,11 +3275,7 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2i4:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; SDAG-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32
@@ -4395,11 +3287,7 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v2i4:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4417,12 +3305,8 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SDAG-NEXT:    buffer_store_byte v0, off, s[0:3], s32
 ; SDAG-NEXT:    buffer_load_ubyte v0, off, s[0:3], s32
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4432,12 +3316,8 @@ define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GISEL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4449,11 +3329,7 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v4i4:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 15
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    buffer_store_short v0, off, s[0:3], s32
@@ -4469,11 +3345,7 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v4i4:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
@@ -4496,12 +3368,8 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 12, v3
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4517,12 +3385,8 @@ define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GISEL-NEXT:    v_and_b32_e32 v1, 15, v3
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4534,11 +3398,7 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v8i4:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v7, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v7, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 15, v7
 ; SDAG-NEXT:    v_bfe_u32 v1, v7, 4, 4
@@ -4553,11 +3413,7 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v8i4:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
@@ -4591,12 +3447,8 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or3_b32 v0, v0, v3, v1
 ; SDAG-NEXT:    v_lshlrev_b32_e32 v1, 28, v7
 ; SDAG-NEXT:    v_and_b32_sdwa v2, v6, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4619,12 +3471,8 @@ define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_b32_e32 v2, 15, v7
 ; GISEL-NEXT:    v_and_b32_sdwa v1, v6, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 28, v2
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4638,11 +3486,7 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v2i6:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 63, v1
 ; SDAG-NEXT:    v_bfe_u32 v1, v1, 6, 6
@@ -4651,11 +3495,7 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v2i6:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b16_e32 v1, 6, v0
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4672,12 +3512,8 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_and_b32_e32 v0, 63, v0
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0xfff, v0
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SDAG-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4688,12 +3524,8 @@ define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_b32_e32 v0, 63, v0
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v1, 6, v1
 ; GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xfff, v0
-; GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_short v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4706,24 +3538,16 @@ define <6 x i32> @load_v32i6(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_v32i6:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_v32i6:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4736,24 +3560,16 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_v32i6:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0 offset:16
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_v32i6:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[4:5], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %data = bitcast <6 x i32> %data.abi to <32 x i6>
@@ -4768,11 +3584,7 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: volatile_load_v4i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
@@ -4782,11 +3594,7 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: volatile_load_v4i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -4805,12 +3613,8 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4825,12 +3629,8 @@ define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4842,12 +3642,8 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: volatile_load_v6i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
-; SDAG-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:4 glc
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT:    buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
 ; SDAG-NEXT:    s_waitcnt vmcnt(1)
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4862,12 +3658,8 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: volatile_load_v6i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; GISEL-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:4 glc
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
@@ -4888,15 +3680,11 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
 ; SDAG-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_store_short v4, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4911,15 +3699,11 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v5
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4931,22 +3715,14 @@ define [2 x [2 x i32]] @load_a2a2i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_a2a2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_a2a2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4958,22 +3734,14 @@ define void @store_a2a2i32([2 x [2 x i32]] %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_a2a2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_a2a2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -4985,22 +3753,14 @@ define [2 x <2 x i32>] @load_a2v2i32(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_a2v2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_a2v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5012,22 +3772,14 @@ define void @store_a2v2i32([2 x <2 x i32>] %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_a2v2i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_a2v2i32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5039,22 +3791,14 @@ define { i32 } @load_sl_i32s(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_sl_i32s:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_sl_i32s:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5066,22 +3810,14 @@ define void @store_sl_i32s({ i32 } %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_sl_i32s:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_sl_i32s:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5093,22 +3829,14 @@ define { { float } } @load_sl_sl_f32ss(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_sl_sl_f32ss:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_sl_sl_f32ss:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5120,22 +3848,14 @@ define void @store_sl_sl_f32ss({ { float } } %data, ptr addrspace(8) inreg %buf)
 ; SDAG-LABEL: store_sl_sl_f32ss:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_sl_sl_f32ss:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5147,22 +3867,14 @@ define { <2 x i32> } @load_sl_v2i32s(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_sl_v2i32s:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_sl_v2i32s:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5174,22 +3886,14 @@ define void @store_sl_v2i32s({ <2 x i32> } %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_sl_v2i32s:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_sl_v2i32s:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5201,22 +3905,14 @@ define { i64, i32 } @load_sl_i64i32s(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_sl_i64i32s:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_sl_i64i32s:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5228,22 +3924,14 @@ define void @store_sl_i64i32s({ i64, i32 } %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_sl_i64i32s:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_sl_i64i32s:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5255,28 +3943,20 @@ define [4 x i7] @load_a4i7(ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: load_a4i7:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
-; SDAG-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
-; SDAG-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
-; SDAG-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
+; SDAG-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; SDAG-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; SDAG-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; SDAG-NEXT:    buffer_load_ubyte v3, off, s[16:19], 0 offset:3
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: load_a4i7:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
-; GISEL-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GISEL-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:1
-; GISEL-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:2
-; GISEL-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:3
+; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:1
+; GISEL-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
+; GISEL-NEXT:    buffer_load_ubyte v3, off, s[16:19], 0 offset:3
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -5288,36 +3968,28 @@ define void @store_a4i7([4 x i7] %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-LABEL: store_a4i7:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s11, s17
-; SDAG-NEXT:    s_mov_b32 s10, s16
-; SDAG-NEXT:    s_mov_b32 s9, s7
-; SDAG-NEXT:    s_mov_b32 s8, s6
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v1
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0 offset:1
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:1
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v2
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0 offset:2
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:2
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x7f, v3
-; SDAG-NEXT:    buffer_store_byte v0, off, s[8:11], 0 offset:3
+; SDAG-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:3
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: store_a4i7:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, s6
-; GISEL-NEXT:    s_mov_b32 s5, s7
-; GISEL-NEXT:    s_mov_b32 s6, s16
-; GISEL-NEXT:    s_mov_b32 s7, s17
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v1
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:1
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v2
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:2
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v3
-; GISEL-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
+; GISEL-NEXT:    buffer_store_byte v0, off, s[16:19], 0 offset:3
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index e483a122615ff2..022094bc633c88 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -94,7 +94,7 @@ define void @caller(ptr addrspace(7) noundef nonnull %arg) {
 ; CHECK-NEXT:    [[V_INT_LEGAL:%.*]] = bitcast i160 [[V_INT]] to <5 x i32>
 ; CHECK-NEXT:    [[V_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[V_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[V_INT_SLICE_0]], ptr addrspace(8) align 32 [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0)
-; CHECK-NEXT:    [[ARG_PART_4:%.*]] = add i32 [[ARG_OFF]], 16
+; CHECK-NEXT:    [[ARG_PART_4:%.*]] = add nuw i32 [[ARG_OFF]], 16
 ; CHECK-NEXT:    [[V_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[V_INT_LEGAL]], i64 4
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[V_INT_SLICE_4]], ptr addrspace(8) align 16 [[ARG_RSRC]], i32 [[ARG_PART_4]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index bf59c1669d226c..c3762e2cfff328 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -57,7 +57,7 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
-; CHECK-NEXT:    [[AUX_PTR_2_PTR_PART_4:%.*]] = add i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
+; CHECK-NEXT:    [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
 ; CHECK-NEXT:    [[BUF_PTR_4_PTR_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], i64 4, !dbg [[DBG33]]
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
 ; CHECK-NEXT:    ret float [[RET]], !dbg [[DBG34:![0-9]+]]



More information about the llvm-commits mailing list