[llvm] [AMDGPU] Improve detection of non-null addrspacecast operands (PR #82311)

via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 19 23:18:14 PST 2024


llvmbot wrote:



@llvm/pr-subscribers-llvm-ir

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

Use IR analysis to infer when an addrspacecast's source operand is non-null, then
lower the cast to an intrinsic that lets the DAG skip the null check.
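
For illustration, here is the before/after of the transform on one of the new tests in this patch (`local_to_flat_nonnull_arg`; the `%1` numbering matches the autogenerated OPT check lines):

```llvm
; Before: the cast source is a nonnull argument, but the generic
; addrspacecast lowering would still emit a null-check select/cndmask.
define void @local_to_flat_nonnull_arg(ptr addrspace(3) nonnull %ptr) {
  %x = addrspacecast ptr addrspace(3) %ptr to ptr
  store volatile i32 7, ptr %x
  ret void
}

; After AMDGPUCodeGenPrepare: the cast is replaced by the new intrinsic,
; which the DAG/GlobalISel lowering treats as a known-non-null cast.
define void @local_to_flat_nonnull_arg(ptr addrspace(3) nonnull %ptr) {
  %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
  store volatile i32 7, ptr %1
  ret void
}
```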

I did this using an intrinsic because it's non-intrusive. An alternative would have been to allow something like `!nonnull` metadata on `addrspacecast` and then lower that to a custom opcode (or add an operand to the addrspacecast MIR/DAG opcodes), but that's a lot of boilerplate for just one target's use case, IMO.

I'm hoping that when we switch to GISel we can move all this logic to the MIR level without losing information, but currently the DAG doesn't see enough, so we need to act in CGP.

I separated the tests into two commits - before and after - so remove the first commit from the diff view if you want to see exactly what the changes do.

Solves SWDEV-316445

---

Patch is 26.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/82311.diff


6 Files Affected:

- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+7) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+82) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+17-4) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-13) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+4) 
- (added) llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll (+279) 


``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 202fa4e8f4ea81..b8899784aba71f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3196,4 +3196,11 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
   [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+/// Emit an addrspacecast without null pointer checking.
+/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
+def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
+  [llvm_anyptr_ty], [llvm_anyptr_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 1c75c5a47c9d27..e5beae49f38503 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -51,6 +51,12 @@ static cl::opt<bool> Widen16BitOps(
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool> LowerAddrSpaceCast(
+    "amdgpu-codegenprepare-addrspacecast",
+    cl::desc("Detect non-null addrspacecast sources and lower them early to "
+             "avoid the null pointer check"),
+    cl::ReallyHidden, cl::init(true));
+
 static cl::opt<bool>
     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                    cl::desc("Break large PHI nodes for DAGISel"),
@@ -99,6 +105,7 @@ class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
   const GCNSubtarget *ST = nullptr;
+  const AMDGPUTargetMachine *TM = nullptr;
   const TargetLibraryInfo *TLInfo = nullptr;
   AssumptionCache *AC = nullptr;
   DominatorTree *DT = nullptr;
@@ -310,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
+  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -2013,6 +2021,78 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
   return true;
 }
 
+bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
+  if (!LowerAddrSpaceCast)
+    return false;
+
+  // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
+  // This is only worthwhile for casts between flat and priv/local.
+  const unsigned SrcAS = I.getSrcAddressSpace();
+  const unsigned DstAS = I.getDestAddressSpace();
+
+  bool CanLower = false;
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
+                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
+  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
+    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
+  if (!CanLower)
+    return false;
+
+  // Check the Src operand, and look through Phis.
+  SmallVector<Value *, 4> WorkList;
+  DenseSet<const PHINode *> SeenPHIs;
+  WorkList.push_back(I.getOperand(0));
+  while (!WorkList.empty()) {
+    Value *Cur = getUnderlyingObject(WorkList.pop_back_val());
+
+    // Pointer cannot be null if it's a block address, GV or alloca.
+    // NOTE: We don't support extern_weak, but if we did, we'd need to check for
+    // it as the symbol could be null in such cases.
+    if (isa<BlockAddress>(Cur) || isa<GlobalValue>(Cur) || isa<AllocaInst>(Cur))
+      continue;
+
+    // Check nonnull arguments.
+    if (const auto *Arg = dyn_cast<Argument>(Cur); Arg && Arg->hasNonNullAttr())
+      continue;
+
+    // TODO: Calls that return nonnull?
+
+    // Look through PHIs - add all incoming values to the queue.
+    if (const auto *Phi = dyn_cast<PHINode>(Cur)) {
+      auto [It, Inserted] = SeenPHIs.insert(Phi);
+      if (!Inserted)
+        return false; // infinite recursion
+
+      for (auto &Inc : Phi->incoming_values())
+        WorkList.push_back(Inc.get());
+      continue;
+    }
+
+    // For all other things, use KnownBits.
+    // We either use 0 or all bits set to indicate null, so check whether the
+    // value can be zero or all ones.
+    auto SrcPtrKB =
+        computeKnownBits(Cur, *DL).trunc(DL->getPointerSizeInBits(SrcAS));
+    const auto NullVal = TM->getNullPointerValue(SrcAS);
+    assert((NullVal == 0 || NullVal == -1) &&
+           "don't know how to check for this null value!");
+    if (NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero())
+      continue;
+
+    // Value is unknown so we can't lower.
+    return false;
+  }
+
+  IRBuilder<> B(&I);
+  auto *Intrin = B.CreateIntrinsic(
+      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
+  I.replaceAllUsesWith(Intrin);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
@@ -2196,6 +2276,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
     return false;
 
   const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
+  Impl.TM = &TM;
   Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -2214,6 +2295,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   AMDGPUCodeGenPrepareImpl Impl;
   Impl.Mod = F.getParent();
   Impl.DL = &Impl.Mod->getDataLayout();
+  Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
   Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0d3b158a0df731..aa9ccb7cf75262 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2247,10 +2247,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
   MachineIRBuilder &B) const {
   MachineFunction &MF = B.getMF();
 
+  // MI can either be a G_ADDRSPACE_CAST or a
+  // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
+  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
+         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
+                                     Intrinsic::amdgcn_addrspacecast_nonnull));
+
   const LLT S32 = LLT::scalar(32);
   Register Dst = MI.getOperand(0).getReg();
-  Register Src = MI.getOperand(1).getReg();
-
+  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+                                     : MI.getOperand(1).getReg();
   LLT DstTy = MRI.getType(Dst);
   LLT SrcTy = MRI.getType(Src);
   unsigned DestAS = DstTy.getAddressSpace();
@@ -2263,6 +2269,11 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
   const AMDGPUTargetMachine &TM
     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
 
+  // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
+  // G_ADDRSPACE_CAST we need to guess.
+  const bool IsKnownNonNull =
+      isa<GIntrinsic>(MI) ? true : isKnownNonNull(Src, MRI, TM, SrcAS);
+
   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
     return true;
@@ -2271,7 +2282,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
-    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+    if (IsKnownNonNull) {
       // Extract low 32-bits of the pointer.
       B.buildExtract(Dst, Src, 0);
       MI.eraseFromParent();
@@ -2308,7 +2319,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
     // avoid the ptrtoint?
     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
 
-    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+    if (IsKnownNonNull) {
       B.buildCopy(Dst, BuildPtr);
       MI.eraseFromParent();
       return true;
@@ -7021,6 +7032,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
 
     return false;
   }
+  case Intrinsic::amdgcn_addrspacecast_nonnull:
+    return legalizeAddrSpaceCast(MI, MRI, B);
   case Intrinsic::amdgcn_make_buffer_rsrc:
     return legalizePointerAsRsrcIntrin(MI, MRI, B);
   case Intrinsic::amdgcn_kernarg_segment_ptr:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7be670f8e76c37..d2d761e36ee084 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1415,6 +1415,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   }
 }
 
+void SITargetLowering::CollectTargetIntrinsicOperands(
+    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
+  switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
+  case Intrinsic::amdgcn_addrspacecast_nonnull: {
+    // The DAG's ValueType loses the addrspaces.
+    // Add them as 2 extra Constant operands "from" and "to".
+    unsigned SrcAS =
+        I.getOperand(0)->getType()->getScalarType()->getPointerAddressSpace();
+    unsigned DstAS = I.getType()->getScalarType()->getPointerAddressSpace();
+    Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
+    Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
+    break;
+  }
+  default:
+    break;
+  }
+}
+
 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                             SmallVectorImpl<Value*> &Ops,
                                             Type *&AccessTy) const {
@@ -6649,24 +6667,37 @@ static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc SL(Op);
-  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
-
-  SDValue Src = ASC->getOperand(0);
-  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
-  unsigned SrcAS = ASC->getSrcAddressSpace();
 
   const AMDGPUTargetMachine &TM =
     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
 
+  unsigned DestAS, SrcAS;
+  SDValue Src;
+  bool KnownNonNull;
+  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
+    SrcAS = ASC->getSrcAddressSpace();
+    Src = ASC->getOperand(0);
+    DestAS = ASC->getDestAddressSpace();
+    KnownNonNull = isKnownNonNull(Op, DAG, TM, SrcAS);
+  } else {
+    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+           Op.getConstantOperandVal(0) ==
+               Intrinsic::amdgcn_addrspacecast_nonnull);
+    Src = Op->getOperand(1);
+    SrcAS = Op->getConstantOperandVal(2);
+    DestAS = Op->getConstantOperandVal(3);
+    KnownNonNull = true;
+  }
+
+  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
   // flat -> local/private
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
-    unsigned DestAS = ASC->getDestAddressSpace();
-
     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
 
-      if (isKnownNonNull(Src, DAG, TM, SrcAS))
+      if (KnownNonNull)
         return Ptr;
 
       unsigned NullVal = TM.getNullPointerValue(DestAS);
@@ -6679,16 +6710,16 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   }
 
   // local/private -> flat
-  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
 
-      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
+      SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
       SDValue CvtPtr =
           DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
       CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
 
-      if (isKnownNonNull(Src, DAG, TM, SrcAS))
+      if (KnownNonNull)
         return CvtPtr;
 
       unsigned NullVal = TM.getNullPointerValue(SrcAS);
@@ -6711,7 +6742,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
   }
 
-  if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
       Src.getValueType() == MVT::i64)
     return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
 
@@ -6722,7 +6753,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
     MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
 
-  return DAG.getUNDEF(ASC->getValueType(0));
+  return DAG.getUNDEF(Op->getValueType(0));
 }
 
 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
@@ -8339,6 +8370,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                         Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                         IndexKeyi32, Op.getOperand(7)});
   }
+  case Intrinsic::amdgcn_addrspacecast_nonnull:
+    return lowerADDRSPACECAST(Op, DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e436c23af5bcac..6ec9e068c49042 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -305,6 +305,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                           MachineFunction &MF,
                           unsigned IntrinsicID) const override;
 
+  void CollectTargetIntrinsicOperands(const CallInst &I,
+                                      SmallVectorImpl<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
   bool getAddrModeArguments(IntrinsicInst * /*I*/,
                             SmallVectorImpl<Value*> &/*Ops*/,
                             Type *&/*AccessTy*/) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
new file mode 100644
index 00000000000000..b5bf369c7dd0c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-- -amdgpu-codegenprepare -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM
+
+; Tests that we can avoid nullptr checks for addrspacecasts from/to priv/local.
+;
+; Whenever a testcase is successful, we should see the addrspacecast replaced with the intrinsic
+; and the resulting code should have no select/cndmask null check for the pointer.
+
+define void @local_to_flat_nonnull_arg(ptr addrspace(3) nonnull %ptr) {
+; OPT-LABEL: define void @local_to_flat_nonnull_arg(
+; OPT-SAME: ptr addrspace(3) nonnull [[PTR:%.*]]) {
+; OPT-NEXT:    [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]])
+; OPT-NEXT:    store volatile i32 7, ptr [[TMP1]], align 4
+; OPT-NEXT:    ret void
+;
+; ASM-LABEL: local_to_flat_nonnull_arg:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; ASM-NEXT:    v_mov_b32_e32 v1, s5
+; ASM-NEXT:    v_mov_b32_e32 v2, 7
+; ASM-NEXT:    flat_store_dword v[0:1], v2
+; ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT:    s_setpc_b64 s[30:31]
+  %x = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 7, ptr %x
+  ret void
+}
+
+define void @private_to_flat_nonnull_arg(ptr addrspace(5) nonnull %ptr) {
+; OPT-LABEL: define void @private_to_flat_nonnull_arg(
+; OPT-SAME: ptr addrspace(5) nonnull [[PTR:%.*]]) {
+; OPT-NEXT:    [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) [[PTR]])
+; OPT-NEXT:    store volatile i32 7, ptr [[TMP1]], align 4
+; OPT-NEXT:    ret void
+;
+; ASM-LABEL: private_to_flat_nonnull_arg:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
+; ASM-NEXT:    v_mov_b32_e32 v1, s5
+; ASM-NEXT:    v_mov_b32_e32 v2, 7
+; ASM-NEXT:    flat_store_dword v[0:1], v2
+; ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT:    s_setpc_b64 s[30:31]
+  %x = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 7, ptr %x
+  ret void
+}
+
+define void @flat_to_local_nonnull_arg(ptr nonnull %ptr) {
+; OPT-LABEL: define void @flat_to_local_nonnull_arg(
+; OPT-SAME: ptr nonnull [[PTR:%.*]]) {
+; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(3) @llvm.amdgcn.addrspacecast.nonnull.p3.p0(ptr [[PTR]])
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(3) [[TMP1]], align 4
+; OPT-NEXT:    ret void
+;
+; ASM-LABEL: flat_to_local_nonnull_arg:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT:    v_mov_b32_e32 v1, 7
+; ASM-NEXT:    ds_write_b32 v0, v1
+; ASM-NEXT:    s_waitcnt lgkmcnt(0)
+; ASM-NEXT:    s_setpc_b64 s[30:31]
+  %x = addrspacecast ptr %ptr to ptr addrspace(3)
+  store volatile i32 7, ptr addrspace(3) %x
+  ret void
+}
+
+define void @flat_to_private_nonnull_arg(ptr nonnull %ptr) {
+; OPT-LABEL: define void @flat_to_private_nonnull_arg(
+; OPT-SAME: ptr nonnull [[PTR:%.*]]) {
+; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr [[PTR]])
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(5) [[TMP1]], align 4
+; OPT-NEXT:    ret void
+;
+; ASM-LABEL: flat_to_private_nonnull_arg:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT:    v_mov_b32_e32 v1, 7
+; ASM-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; ASM-NEXT:    s_waitcnt vmcnt(0)
+; ASM-NEXT:    s_setpc_b64 s[30:31]
+  %x = addrspacecast ptr %ptr to ptr addrspace(5)
+  store volatile i32 7, ptr addrspace(5) %x
+  ret void
+}
+
+define void @private_alloca_to_flat(ptr %ptr) {
+; OPT-LABEL: define void @private_alloca_to_flat(
+; OPT-SAME: ptr [[PTR:%.*]]) {
+; OPT-NEXT:    [[ALLOCA:%.*]] = alloca i8, align 1, addrspace(5)
+; OPT-NEXT:    [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) [[ALLOCA]])
+; OPT-NEXT:    store volatile i32 7, ptr [[TMP1]], align 4
+; OPT-NEXT:    ret void
+;
+; ASM-LABEL: private_alloca_to_flat:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
+; ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; ASM-NEXT:    v_mov_b32_e32 v1, s5
+; ASM-NEXT:    v_mov_b32_e32 v2, 7
+; ASM-NEXT:    flat_store_dword v[0:1], v2
+; ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i8, addrspace(5)
+  %x = addrspacecast ptr addrspace(5) %alloca to ptr
+  store volatile i32 7, ptr %x
+  ret void
+}
+
+@lds = internal unnamed_addr addrspace(3) global i8 undef, align 4
+
+define void @knownbits_on_flat_to_priv(ptr %ptr) {
+; OPT-LABEL: define void @knownbits_on_flat_to_priv(
+; OPT-SAME: ptr [[PTR:%.*]]) {
+; OPT-NEXT:    [[PTR_INT:%.*]] = ptrtoint ptr [[PTR]] to i64
+; OPT-NEXT:    [[PTR_OR:%.*]] = or i64 [[PTR_INT]], 15
+; OPT-NEXT:    [[KB_PTR:%.*]] = inttoptr i64 [[PTR_OR]] to ptr
+; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr [[KB_PTR]])
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(5) [[TMP1]], align 4
+; OPT-NEXT:    ret void
+;
+; ASM-LABEL: knownbits_on_flat_to_priv:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT:    v_or_b32_e32 v0, 15, v0
+; ASM-NEXT:    v_mov_b32_e32 v1, 7
+; ASM-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; ASM-NEXT:    s_waitcnt vmcnt(0)
+; ASM-NEXT:    s_setpc_b64 s[30:31]
+  %ptr.int = ptrtoint ptr %ptr to i64
+  %ptr.or = or i64 %ptr.int, 15 ; set some low bits
+  %kb.ptr = inttoptr i64 %ptr.or to ptr
+  %x = addrspacecast ptr %kb.ptr to ptr addrspace(5)
+  store volatile i32 7, ptr addr...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/82311

