[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Only fold flat offsets if they are inbounds (PR #132353)
Fabian Ritter via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Mar 31 04:37:45 PDT 2025
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/132353
From a8155cf5b7847a041be8d4252b20cae01d305404 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 21 Mar 2025 03:33:02 -0400
Subject: [PATCH] [AMDGPU][SDAG] Only fold flat offsets if they are inbounds
For flat memory instructions where the address is supplied as a base address
register with an immediate offset, the memory aperture test ignores the
immediate offset. Currently, ISel does not respect that, which leads to
miscompilations: valid input programs crash when the address computation
relies on the immediate offset to bring the address into the proper memory
aperture. Global and scratch instructions are not affected.
This patch only selects flat instructions with immediate offsets for address
computations that carry the inbounds flag: if the address computation does not
leave the bounds of the allocated object, it cannot leave the bounds of the
memory aperture and is therefore safe to handle with an immediate offset.
It also adds the inbounds flag to DAG nodes resulting from transformations:
- Address computations resulting from getObjectPtrOffset. As far as I can tell,
this function is only used to compute addresses within accessed memory ranges,
e.g., for loads and stores that are split during legalization.
- Reassociated inbounds adds. If both involved add operations are inbounds,
then so are the operations resulting from the transformation.
- Address computations in the SelectionDAG lowering of the memcpy/move/set
intrinsics. Both the base and the result of the address arithmetic there are
accessed, so the operation must be inbounds (see the sketch after this list).
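A minimal example of that last case, assuming a plain memset call (it mirrors
the new memset_p0_sz19 test added to memintrinsic-unroll.ll below):

; Every byte dst+0 .. dst+18 is written, so the address arithmetic created
; during lowering is inbounds and the offsets of the resulting flat stores
; can still be folded.
define void @set19(ptr %dst) {
  call void @llvm.memset.p0.i64(ptr %dst, i8 65, i64 19, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)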
It might make sense to separate these changes into their own PR, but I don't
see a way to test them without adding a use of the inbounds SDAG flag.
Affected tests:
- CodeGen/AMDGPU/fold-gep-offset.ll: Offsets are no longer wrongly folded,
added new positive tests where we still do fold them.
- Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll: Offset folding doesn't
seem integral to this test, so the test is not changed to make offset folding
still happen.
- CodeGen/AMDGPU/loop-prefetch-data.ll: loop-reduce prefers to base the
addresses of memory accesses on the potentially OOB addresses used for
prefetching; that might be a separate issue to look into.
- Added memset tests to CodeGen/AMDGPU/memintrinsic-unroll.ll to make sure that
offsets in the memset DAG lowering are still folded properly.
A similar patch for GlobalISel will follow.
Fixes SWDEV-516125.
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 12 +-
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 12 +-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 140 ++++---
llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll | 374 +++++++++++++++++-
.../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 17 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 241 +++++++++++
.../InferAddressSpaces/AMDGPU/flat_atomic.ll | 6 +-
8 files changed, 717 insertions(+), 94 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 15a2370e5d8b8..aa3668d3e9aae 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1069,7 +1069,8 @@ class SelectionDAG {
SDValue EVL);
/// Returns sum of the base pointer and offset.
- /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default.
+ /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by
+ /// default.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL,
const SDNodeFlags Flags = SDNodeFlags());
SDValue getMemBasePlusOffset(SDValue Base, SDValue Offset, const SDLoc &DL,
@@ -1077,15 +1078,18 @@ class SelectionDAG {
/// Create an add instruction with appropriate flags when used for
/// addressing some offset of an object. i.e. if a load is split into multiple
- /// components, create an add nuw from the base pointer to the offset.
+ /// components, create an add nuw inbounds from the base pointer to the
+ /// offset.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) {
- return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+ return getMemBasePlusOffset(
+ Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
}
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) {
// The object itself can't wrap around the address space, so it shouldn't be
// possible for the adds of the offsets to the split parts to overflow.
- return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+ return getMemBasePlusOffset(
+ Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
}
/// Return a new CALLSEQ_START node, that starts new call frame, in which
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4487b9d510cc7..8606c80296f6b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1205,9 +1205,12 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) {
SDNodeFlags NewFlags;
- if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
- Flags.hasNoUnsignedWrap())
- NewFlags |= SDNodeFlags::NoUnsignedWrap;
+ if (N0.getOpcode() == ISD::ADD) {
+ if (N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap())
+ NewFlags |= SDNodeFlags::NoUnsignedWrap;
+ if (N0->getFlags().hasInBounds() && Flags.hasInBounds())
+ NewFlags |= SDNodeFlags::InBounds;
+ }
if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3526beeb312ce..e7ae506c0286f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8178,7 +8178,7 @@ static SDValue getMemcpyLoadsAndStores(
if (Value.getNode()) {
Store = DAG.getStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
OutChains.push_back(Store);
}
@@ -8203,14 +8203,14 @@ static SDValue getMemcpyLoadsAndStores(
Value = DAG.getExtLoad(
ISD::EXTLOAD, dl, NVT, Chain,
- DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
SrcPtrInfo.getWithOffset(SrcOff), VT,
commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo);
OutLoadChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo);
OutStoreChains.push_back(Store);
}
@@ -8347,7 +8347,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
Value = DAG.getLoad(
VT, dl, Chain,
- DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo);
LoadValues.push_back(Value);
LoadChains.push_back(Value.getValue(1));
@@ -8362,7 +8362,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
Store = DAG.getStore(
Chain, dl, LoadValues[i],
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
OutChains.push_back(Store);
DstOff += VTSize;
@@ -8494,7 +8494,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
assert(Value.getValueType() == VT && "Value with wrong type.");
SDValue Store = DAG.getStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment,
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone,
NewAAInfo);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 536bf0c208752..62c009d06a4de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1744,72 +1744,82 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
isFlatScratchBaseLegal(Addr))) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
- //
- // For a FLAT instruction the hardware decides whether to access
- // global/scratch/shared memory based on the high bits of vaddr,
- // ignoring the offset field, so we have to ensure that when we add
- // remainder to vaddr it still points into the same underlying object.
- // The easiest way to do that is to make sure that we split the offset
- // into two pieces that are both >= 0 or both <= 0.
-
- SDLoc DL(N);
- uint64_t RemainderOffset;
-
- std::tie(OffsetVal, RemainderOffset) =
- TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
- SDValue AddOffsetLo =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
- if (Addr.getValueType().getSizeInBits() == 32) {
- SmallVector<SDValue, 3> Opnds;
- Opnds.push_back(N0);
- Opnds.push_back(AddOffsetLo);
- unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
- if (Subtarget->hasAddNoCarry()) {
- AddOp = AMDGPU::V_ADD_U32_e64;
- Opnds.push_back(Clamp);
- }
- Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ // Adding the offset to the base address in a FLAT instruction must not
+ // change the memory aperture in which the address falls. Therefore we can
+ // only fold offsets from inbounds GEPs into FLAT instructions.
+ bool IsInBounds = Addr->getFlags().hasInBounds();
+ if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
} else {
- // TODO: Should this try to use a scalar add pseudo if the base address
- // is uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
-
- SDValue AddOffsetHi =
- getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
- SDNode *Add =
- CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs),
- 0);
+ // If the offset doesn't fit, put the low bits into the offset field
+ // and add the rest.
+ //
+ // For a FLAT instruction the hardware decides whether to access
+ // global/scratch/shared memory based on the high bits of vaddr,
+ // ignoring the offset field, so we have to ensure that when we add
+ // remainder to vaddr it still points into the same underlying object.
+ // The easiest way to do that is to make sure that we split the offset
+ // into two pieces that are both >= 0 or both <= 0.
+
+ SDLoc DL(N);
+ uint64_t RemainderOffset;
+
+ std::tie(OffsetVal, RemainderOffset) =
+ TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ if (Addr.getValueType().getSizeInBits() == 32) {
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(N0);
+ Opnds.push_back(AddOffsetLo);
+ unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ AddOp = AMDGPU::V_ADD_U32_e64;
+ Opnds.push_back(Clamp);
+ }
+ Addr =
+ SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+ } else {
+ // TODO: Should this try to use a scalar add pseudo if the base
+ // address is uniform and saddr is usable?
+ SDValue Sub0 =
+ CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 =
+ CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub1);
+
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+ MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
index 0959687d3834c..aeeafbdb77eca 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
@@ -16,8 +16,8 @@
; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3)
-; FIXME the offset here should not be folded: if %p points to the beginning of
-; scratch or LDS and %i is -1, a folded offset crashes the program.
+; The offset here cannot be folded: if %p points to the beginning of scratch or
+; LDS and %i is -1, a folded offset crashes the program.
define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX90A-LABEL: flat_offset_maybe_oob:
; GFX90A: ; %bb.0:
@@ -26,7 +26,9 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -37,7 +39,9 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -46,7 +50,8 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
-; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12
+; GFX942-NEXT: flat_load_dword v0, v[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -57,9 +62,12 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -76,7 +84,10 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%idx = add nsw i32 %i, 3
@@ -156,3 +167,350 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
%l = load i32, ptr addrspace(5) %arrayidx
ret i32 %l
}
+
+; If the GEP that adds the offset is inbounds, folding the offset is legal.
+define i32 @flat_offset_inbounds(ptr %p, i32 %i) {
+; GFX90A-LABEL: flat_offset_inbounds:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_offset_inbounds:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: flat_offset_inbounds:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_offset_inbounds:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_offset_inbounds:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %p.1 = getelementptr inbounds i32, ptr %p, i32 %i
+ %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3
+ %l = load i32, ptr %arrayidx
+ ret i32 %l
+}
+
+define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) {
+; GFX90A-LABEL: flat_offset_inbounds_wide:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX90A-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX90A-NEXT: flat_load_dword v8, v[0:1] offset:28
+; GFX90A-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_store_dword v[2:3], v8 offset:16
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_offset_inbounds_wide:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: flat_load_dword v8, v[0:1] offset:28
+; GFX10-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
+; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX10-NEXT: flat_store_dword v[2:3], v8 offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: flat_offset_inbounds_wide:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1]
+; GFX942-NEXT: flat_load_dword v8, v[0:1] offset:28
+; GFX942-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: flat_store_dword v[2:3], v8 offset:16
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_offset_inbounds_wide:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: flat_load_b32 v8, v[0:1] offset:28
+; GFX11-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
+; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX11-NEXT: flat_store_b32 v[2:3], v8 offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[4:7]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_offset_inbounds_wide:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5]
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: flat_load_b32 v8, v[0:1] offset:28
+; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
+; GFX12-NEXT: flat_store_b32 v[2:3], v8 offset:16
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x1
+; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7]
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %p.1 = getelementptr inbounds i32, ptr %p, i32 %i
+ %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3
+ %l = load <5 x i32>, ptr %arrayidx
+ store <5 x i32> %l, ptr %pout
+ ret void
+}
+
+define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) {
+; GFX90A-LABEL: flat_offset_inbounds_very_wide:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX90A-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX90A-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108
+; GFX90A-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124
+; GFX90A-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76
+; GFX90A-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92
+; GFX90A-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44
+; GFX90A-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60
+; GFX90A-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12
+; GFX90A-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28
+; GFX90A-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[28:31]
+; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16
+; GFX90A-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_offset_inbounds_very_wide:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX10-NEXT: s_clause 0x8
+; GFX10-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108
+; GFX10-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124
+; GFX10-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76
+; GFX10-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92
+; GFX10-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44
+; GFX10-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60
+; GFX10-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12
+; GFX10-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28
+; GFX10-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140
+; GFX10-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96
+; GFX10-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112
+; GFX10-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64
+; GFX10-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80
+; GFX10-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32
+; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[28:31]
+; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8)
+; GFX10-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: flat_offset_inbounds_very_wide:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1]
+; GFX942-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108
+; GFX942-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124
+; GFX942-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76
+; GFX942-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92
+; GFX942-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44
+; GFX942-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60
+; GFX942-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12
+; GFX942-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28
+; GFX942-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[28:31]
+; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16
+; GFX942-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_offset_inbounds_very_wide:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX11-NEXT: s_clause 0x8
+; GFX11-NEXT: flat_load_b128 v[4:7], v[0:1] offset:108
+; GFX11-NEXT: flat_load_b128 v[8:11], v[0:1] offset:124
+; GFX11-NEXT: flat_load_b128 v[12:15], v[0:1] offset:76
+; GFX11-NEXT: flat_load_b128 v[16:19], v[0:1] offset:92
+; GFX11-NEXT: flat_load_b128 v[20:23], v[0:1] offset:44
+; GFX11-NEXT: flat_load_b128 v[24:27], v[0:1] offset:60
+; GFX11-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12
+; GFX11-NEXT: flat_load_b128 v[32:35], v[0:1] offset:28
+; GFX11-NEXT: flat_load_b128 v[36:39], v[0:1] offset:140
+; GFX11-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[4:7] offset:96
+; GFX11-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[8:11] offset:112
+; GFX11-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[12:15] offset:64
+; GFX11-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[16:19] offset:80
+; GFX11-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[24:27] offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[28:31]
+; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b128 v[2:3], v[32:35] offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8)
+; GFX11-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_offset_inbounds_very_wide:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5]
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX12-NEXT: s_clause 0x8
+; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1] offset:108
+; GFX12-NEXT: flat_load_b128 v[8:11], v[0:1] offset:124
+; GFX12-NEXT: flat_load_b128 v[12:15], v[0:1] offset:76
+; GFX12-NEXT: flat_load_b128 v[16:19], v[0:1] offset:92
+; GFX12-NEXT: flat_load_b128 v[20:23], v[0:1] offset:44
+; GFX12-NEXT: flat_load_b128 v[24:27], v[0:1] offset:60
+; GFX12-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12
+; GFX12-NEXT: flat_load_b128 v[32:35], v[0:1] offset:28
+; GFX12-NEXT: flat_load_b128 v[36:39], v[0:1] offset:140
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x808
+; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7] offset:96
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x708
+; GFX12-NEXT: flat_store_b128 v[2:3], v[8:11] offset:112
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x608
+; GFX12-NEXT: flat_store_b128 v[2:3], v[12:15] offset:64
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x508
+; GFX12-NEXT: flat_store_b128 v[2:3], v[16:19] offset:80
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x408
+; GFX12-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x308
+; GFX12-NEXT: flat_store_b128 v[2:3], v[24:27] offset:48
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x208
+; GFX12-NEXT: flat_store_b128 v[2:3], v[28:31]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x108
+; GFX12-NEXT: flat_store_b128 v[2:3], v[32:35] offset:16
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x8
+; GFX12-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %p.1 = getelementptr inbounds i32, ptr %p, i32 %i
+ %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3
+ %l = load <35 x i32>, ptr %arrayidx
+ store <35 x i32> %l, ptr %pout
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 874dece6b728d..0c55c91ba8dbd 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -11,18 +11,22 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GFX12-NEXT: s_cbranch_scc1 .LBB0_3
; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_movk_i32 s4, 0xff50
+; GFX12-NEXT: s_mov_b32 s5, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-NEXT: .LBB0_2: ; %for.body
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5]
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: s_add_co_i32 s6, s6, -1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3]
; GFX12-NEXT: s_cbranch_scc1 .LBB0_2
@@ -37,17 +41,20 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3
; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50
+; GFX12-SPREFETCH-NEXT: s_mov_b32 s5, -1
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body
; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe
-; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5]
; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
-; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1]
; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 0f1c1cf0d80af..0fafb1dc42a6e 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -16031,6 +16031,241 @@ entry:
ret void
}
+define void @memset_p0_sz19(ptr addrspace(0) %dst) {
+; CHECK-LABEL: memset_p0_sz19:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x41414141
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x41
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_mov_b32_e32 v4, s4
+; CHECK-NEXT: v_mov_b32_e32 v7, 0x4141
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v5, s5
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p0_sz19:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v2, 0x41
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:18
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:17
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:16
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:15
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:14
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:13
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:12
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:11
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:10
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:9
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:8
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:7
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:6
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:5
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:4
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:3
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:2
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:1
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p0_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v6, 0x41
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v7, 0x4141
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v5, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s5
+; UNROLL3-NEXT: flat_store_byte v[0:1], v6 offset:18
+; UNROLL3-NEXT: flat_store_short v[0:1], v7 offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull %dst, i8 65, i64 19, i1 false)
+ ret void
+}
+
+define void @memset_p1_sz19(ptr addrspace(1) %dst) {
+; CHECK-LABEL: memset_p1_sz19:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41414141
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p1_sz19:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v2, 0x41
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:18
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:17
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:16
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:15
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:14
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:13
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:12
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:11
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:10
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:9
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:8
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:7
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:6
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:5
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:4
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:3
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:2
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:1
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p1_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: v_mov_b32_e32 v2, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v3, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v4, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v5, v2
+; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; UNROLL3-NEXT: global_store_dword v[0:1], v2, off offset:15
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull %dst, i8 65, i64 19, i1 false)
+ ret void
+}
+
+define void @memset_p3_sz19(ptr addrspace(3) %dst) {
+; CHECK-LABEL: memset_p3_sz19:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x41414141
+; CHECK-NEXT: v_mov_b32_e32 v3, 0x41
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_mov_b32_e32 v4, 0x4141
+; CHECK-NEXT: v_mov_b32_e32 v1, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s5
+; CHECK-NEXT: ds_write_b8 v0, v3 offset:18
+; CHECK-NEXT: ds_write_b16 v0, v4 offset:16
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p3_sz19:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:18
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:17
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:16
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:15
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:14
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:13
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:12
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:11
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:10
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:9
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:8
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:7
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:6
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:5
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:4
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:3
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:2
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:1
+; ALIGNED-NEXT: ds_write_b8 v0, v1
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p3_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v3, 0x41
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v4, 0x4141
+; UNROLL3-NEXT: v_mov_b32_e32 v1, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s5
+; UNROLL3-NEXT: ds_write_b8 v0, v3 offset:18
+; UNROLL3-NEXT: ds_write_b16 v0, v4 offset:16
+; UNROLL3-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:1
+; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull %dst, i8 65, i64 19, i1 false)
+ ret void
+}
+
+define void @memset_p5_sz19(ptr addrspace(5) %dst) {
+; CHECK-LABEL: memset_p5_sz19:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41414141
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41
+; CHECK-NEXT: v_mov_b32_e32 v3, 0x4141
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p5_sz19:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p5_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: v_mov_b32_e32 v1, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v2, 0x41
+; UNROLL3-NEXT: v_mov_b32_e32 v3, 0x4141
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; UNROLL3-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; UNROLL3-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull %dst, i8 65, i64 19, i1 false)
+ ret void
+}
declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
@@ -16046,4 +16281,10 @@ declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr a
declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memset.p0.i64(ptr addrspace(0) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p3.i64(ptr addrspace(3) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) #3
+
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index 39af91b81110d..60aac9ad56f06 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -13,9 +13,9 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s2, s0
; CHECK-NEXT: s_addc_u32 s1, s3, s1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; CHECK-NEXT: s_add_u32 s0, s0, -8
+; CHECK-NEXT: s_addc_u32 s1, s1, -1
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol