[llvm] 79261d4 - [NVPTX][InferAS] assume alloca instructions are in local AS (#121710)

via llvm-commits <llvm-commits at lists.llvm.org>
Fri Feb 21 14:32:57 PST 2025


Author: Alex MacLean
Date: 2025-02-21T14:32:54-08:00
New Revision: 79261d4aab4f7a0f56f5ea32a5ac06241c5cd94a

URL: https://github.com/llvm/llvm-project/commit/79261d4aab4f7a0f56f5ea32a5ac06241c5cd94a
DIFF: https://github.com/llvm/llvm-project/commit/79261d4aab4f7a0f56f5ea32a5ac06241c5cd94a.diff

LOG: [NVPTX][InferAS] assume alloca instructions are in local AS (#121710)
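
For context, a minimal sketch of the rewrite this enables, mirroring the new
InferAddressSpaces test added below (addrspace(5) is the NVPTX local space):

    ; input
    %addr = alloca float
    %val  = load float, ptr %addr

    ; after opt -passes=infer-address-spaces
    %addr = alloca float, align 4
    %0    = addrspacecast ptr %addr to ptr addrspace(5)
    %val  = load float, ptr addrspace(5) %0, align 4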

Added: 
    llvm/test/Transforms/InferAddressSpaces/NVPTX/alloca.ll

Modified: 
    llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
    llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
    llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
    llvm/test/CodeGen/NVPTX/indirect_byval.ll
    llvm/test/CodeGen/NVPTX/local-stack-frame.ll
    llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
    llvm/test/CodeGen/NVPTX/lower-args.ll
    llvm/test/CodeGen/NVPTX/variadics-backend.ll
    llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6621aa06ac268..6f0bf510ad893 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
+#include <optional>
 
 using namespace llvm;
 
@@ -342,30 +343,28 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
   return true;
 }
 
-static unsigned int getCodeAddrSpace(MemSDNode *N) {
-  const Value *Src = N->getMemOperand()->getValue();
-
-  if (!Src)
+static std::optional<unsigned> convertAS(unsigned AS) {
+  switch (AS) {
+  case llvm::ADDRESS_SPACE_LOCAL:
+    return NVPTX::AddressSpace::Local;
+  case llvm::ADDRESS_SPACE_GLOBAL:
+    return NVPTX::AddressSpace::Global;
+  case llvm::ADDRESS_SPACE_SHARED:
+    return NVPTX::AddressSpace::Shared;
+  case llvm::ADDRESS_SPACE_GENERIC:
     return NVPTX::AddressSpace::Generic;
-
-  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
-    switch (PT->getAddressSpace()) {
-    case llvm::ADDRESS_SPACE_LOCAL:
-      return NVPTX::AddressSpace::Local;
-    case llvm::ADDRESS_SPACE_GLOBAL:
-      return NVPTX::AddressSpace::Global;
-    case llvm::ADDRESS_SPACE_SHARED:
-      return NVPTX::AddressSpace::Shared;
-    case llvm::ADDRESS_SPACE_GENERIC:
-      return NVPTX::AddressSpace::Generic;
-    case llvm::ADDRESS_SPACE_PARAM:
-      return NVPTX::AddressSpace::Param;
-    case llvm::ADDRESS_SPACE_CONST:
-      return NVPTX::AddressSpace::Const;
-    default: break;
-    }
+  case llvm::ADDRESS_SPACE_PARAM:
+    return NVPTX::AddressSpace::Param;
+  case llvm::ADDRESS_SPACE_CONST:
+    return NVPTX::AddressSpace::Const;
+  default:
+    return std::nullopt;
   }
-  return NVPTX::AddressSpace::Generic;
+}
+
+static unsigned int getCodeAddrSpace(const MemSDNode *N) {
+  return convertAS(N->getMemOperand()->getAddrSpace())
+      .value_or(NVPTX::AddressSpace::Generic);
 }
 
 namespace {

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5d2dfe76b1b98..e8eadcc4b120e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1405,6 +1405,19 @@ static bool shouldConvertToIndirectCall(const CallBase *CB,
   return false;
 }
 
+static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG,
+                                      const DataLayout &DL,
+                                      const TargetLowering &TL) {
+  if (Ptr->getOpcode() == ISD::FrameIndex) {
+    auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
+    Ptr = DAG.getAddrSpaceCast(SDLoc(), Ty, Ptr, ADDRESS_SPACE_GENERIC,
+                               ADDRESS_SPACE_LOCAL);
+
+    return MachinePointerInfo(ADDRESS_SPACE_LOCAL);
+  }
+  return MachinePointerInfo();
+}
+
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
 
@@ -1564,11 +1577,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
 
       if (IsByVal) {
-        auto PtrVT = getPointerTy(DL);
-        SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
+        auto MPI = refinePtrAS(StVal, DAG, DL, *this);
+        const EVT PtrVT = StVal.getValueType();
+        SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
                                       DAG.getConstant(CurOffset, dl, PtrVT));
-        StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
-                            PartAlign);
+
+        StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign);
       } else if (ExtendIntegerParam) {
         assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
         // zext/sext to i32

diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 85e99d7fe97a2..e216f09c02d92 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -20,6 +20,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
 using namespace llvm;
@@ -564,6 +565,13 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   return nullptr;
 }
 
+unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
+  if (isa<AllocaInst>(V))
+    return ADDRESS_SPACE_LOCAL;
+
+  return -1;
+}
+
 void NVPTXTTIImpl::collectKernelLaunchBounds(
     const Function &F,
     SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {

diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index b0a846a9c7f96..7f69d422e8b4b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -129,6 +129,7 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
 
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const;
+  unsigned getAssumedAddrSpace(const Value *V) const;
 
   void collectKernelLaunchBounds(
       const Function &F,

diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
index d6c6e032f032f..3ae6300d8767d 100644
--- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -17,19 +17,20 @@ define internal i32 @foo() {
 ; CHECK-NEXT:    .reg .b64 %SPL;
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.u64 %SPL, __local_depot0;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
-; CHECK-NEXT:    ld.u8 %rs1, [%SP+1];
-; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
+; CHECK-NEXT:    add.u64 %rd3, %SPL, 1;
+; CHECK-NEXT:    ld.local.u8 %rs1, [%rd3];
+; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 1 .b8 param0[1];
 ; CHECK-NEXT:    st.param.b8 [param0], %rs1;
 ; CHECK-NEXT:    .param .b64 param1;
-; CHECK-NEXT:    st.param.b64 [param1], %rd2;
+; CHECK-NEXT:    st.param.b64 [param1], %rd4;
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
 ; CHECK-NEXT:    call (retval0),
@@ -59,19 +60,20 @@ define internal i32 @bar() {
 ; CHECK-NEXT:    .reg .b64 %SP;
 ; CHECK-NEXT:    .reg .b64 %SPL;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.u64 %SPL, __local_depot1;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
-; CHECK-NEXT:    ld.u64 %rd2, [%SP+8];
-; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    add.u64 %rd3, %SPL, 8;
+; CHECK-NEXT:    ld.local.u64 %rd4, [%rd3];
+; CHECK-NEXT:    add.u64 %rd5, %SP, 0;
 ; CHECK-NEXT:    { // callseq 1, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.b64 [param0], %rd2;
+; CHECK-NEXT:    st.param.b64 [param0], %rd4;
 ; CHECK-NEXT:    .param .b64 param1;
-; CHECK-NEXT:    st.param.b64 [param1], %rd3;
+; CHECK-NEXT:    st.param.b64 [param1], %rd5;
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
 ; CHECK-NEXT:    call (retval0),

diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index f21ff974a2c6b..3523ffe6ae3ca 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64
 ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -5,31 +6,91 @@
 
 ; Ensure we access the local stack properly
 
-; PTX32:        mov.u32          %SPL, __local_depot{{[0-9]+}};
-; PTX32:        cvta.local.u32   %SP, %SPL;
-; PTX32:        ld.param.u32     %r{{[0-9]+}}, [foo_param_0];
-; PTX32:        st.volatile.u32  [%SP], %r{{[0-9]+}};
-; PTX64:        mov.u64          %SPL, __local_depot{{[0-9]+}};
-; PTX64:        cvta.local.u64   %SP, %SPL;
-; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo_param_0];
-; PTX64:        st.volatile.u32  [%SP], %r{{[0-9]+}};
 define void @foo(i32 %a) {
+; PTX32-LABEL: foo(
+; PTX32:       {
+; PTX32-NEXT:    .local .align 4 .b8 __local_depot0[4];
+; PTX32-NEXT:    .reg .b32 %SP;
+; PTX32-NEXT:    .reg .b32 %SPL;
+; PTX32-NEXT:    .reg .b32 %r<4>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    mov.u32 %SPL, __local_depot0;
+; PTX32-NEXT:    ld.param.u32 %r1, [foo_param_0];
+; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
+; PTX32-NEXT:    st.local.u32 [%r3], %r1;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: foo(
+; PTX64:       {
+; PTX64-NEXT:    .local .align 4 .b8 __local_depot0[4];
+; PTX64-NEXT:    .reg .b64 %SP;
+; PTX64-NEXT:    .reg .b64 %SPL;
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<3>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    mov.u64 %SPL, __local_depot0;
+; PTX64-NEXT:    ld.param.u32 %r1, [foo_param_0];
+; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX64-NEXT:    st.local.u32 [%rd2], %r1;
+; PTX64-NEXT:    ret;
   %local = alloca i32, align 4
   store volatile i32 %a, ptr %local
   ret void
 }
 
-; PTX32:        mov.u32          %SPL, __local_depot{{[0-9]+}};
-; PTX32:        cvta.local.u32   %SP, %SPL;
-; PTX32:        ld.param.u32     %r{{[0-9]+}}, [foo2_param_0];
-; PTX32:        add.u32          %r[[SP_REG:[0-9]+]], %SPL, 0;
-; PTX32:        st.local.u32  [%r[[SP_REG]]], %r{{[0-9]+}};
-; PTX64:        mov.u64          %SPL, __local_depot{{[0-9]+}};
-; PTX64:        cvta.local.u64   %SP, %SPL;
-; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo2_param_0];
-; PTX64:        add.u64          %rd[[SP_REG:[0-9]+]], %SPL, 0;
-; PTX64:        st.local.u32  [%rd[[SP_REG]]], %r{{[0-9]+}};
 define ptx_kernel void @foo2(i32 %a) {
+; PTX32-LABEL: foo2(
+; PTX32:       {
+; PTX32-NEXT:    .local .align 4 .b8 __local_depot1[4];
+; PTX32-NEXT:    .reg .b32 %SP;
+; PTX32-NEXT:    .reg .b32 %SPL;
+; PTX32-NEXT:    .reg .b32 %r<4>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    mov.u32 %SPL, __local_depot1;
+; PTX32-NEXT:    cvta.local.u32 %SP, %SPL;
+; PTX32-NEXT:    ld.param.u32 %r1, [foo2_param_0];
+; PTX32-NEXT:    add.u32 %r2, %SP, 0;
+; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
+; PTX32-NEXT:    st.local.u32 [%r3], %r1;
+; PTX32-NEXT:    { // callseq 0, 0
+; PTX32-NEXT:    .param .b32 param0;
+; PTX32-NEXT:    st.param.b32 [param0], %r2;
+; PTX32-NEXT:    call.uni
+; PTX32-NEXT:    bar,
+; PTX32-NEXT:    (
+; PTX32-NEXT:    param0
+; PTX32-NEXT:    );
+; PTX32-NEXT:    } // callseq 0
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: foo2(
+; PTX64:       {
+; PTX64-NEXT:    .local .align 4 .b8 __local_depot1[4];
+; PTX64-NEXT:    .reg .b64 %SP;
+; PTX64-NEXT:    .reg .b64 %SPL;
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<3>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    mov.u64 %SPL, __local_depot1;
+; PTX64-NEXT:    cvta.local.u64 %SP, %SPL;
+; PTX64-NEXT:    ld.param.u32 %r1, [foo2_param_0];
+; PTX64-NEXT:    add.u64 %rd1, %SP, 0;
+; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX64-NEXT:    st.local.u32 [%rd2], %r1;
+; PTX64-NEXT:    { // callseq 0, 0
+; PTX64-NEXT:    .param .b64 param0;
+; PTX64-NEXT:    st.param.b64 [param0], %rd1;
+; PTX64-NEXT:    call.uni
+; PTX64-NEXT:    bar,
+; PTX64-NEXT:    (
+; PTX64-NEXT:    param0
+; PTX64-NEXT:    );
+; PTX64-NEXT:    } // callseq 0
+; PTX64-NEXT:    ret;
   %local = alloca i32, align 4
   store i32 %a, ptr %local
   call void @bar(ptr %local)
@@ -38,39 +99,120 @@ define ptx_kernel void @foo2(i32 %a) {
 
 declare void @bar(ptr %a)
 
-
-; PTX32:        mov.u32          %SPL, __local_depot{{[0-9]+}};
-; PTX32-NOT:    cvta.local.u32   %SP, %SPL;
-; PTX32:        ld.param.u32     %r{{[0-9]+}}, [foo3_param_0];
-; PTX32:        add.u32          %r{{[0-9]+}}, %SPL, 0;
-; PTX32:        st.local.u32  [%r{{[0-9]+}}], %r{{[0-9]+}};
-; PTX64:        mov.u64          %SPL, __local_depot{{[0-9]+}};
-; PTX64-NOT:    cvta.local.u64   %SP, %SPL;
-; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo3_param_0];
-; PTX64:        add.u64          %rd{{[0-9]+}}, %SPL, 0;
-; PTX64:        st.local.u32  [%rd{{[0-9]+}}], %r{{[0-9]+}};
 define void @foo3(i32 %a) {
+; PTX32-LABEL: foo3(
+; PTX32:       {
+; PTX32-NEXT:    .local .align 4 .b8 __local_depot2[12];
+; PTX32-NEXT:    .reg .b32 %SP;
+; PTX32-NEXT:    .reg .b32 %SPL;
+; PTX32-NEXT:    .reg .b32 %r<6>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    mov.u32 %SPL, __local_depot2;
+; PTX32-NEXT:    ld.param.u32 %r1, [foo3_param_0];
+; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
+; PTX32-NEXT:    shl.b32 %r4, %r1, 2;
+; PTX32-NEXT:    add.s32 %r5, %r3, %r4;
+; PTX32-NEXT:    st.local.u32 [%r5], %r1;
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: foo3(
+; PTX64:       {
+; PTX64-NEXT:    .local .align 4 .b8 __local_depot2[12];
+; PTX64-NEXT:    .reg .b64 %SP;
+; PTX64-NEXT:    .reg .b64 %SPL;
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<5>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    mov.u64 %SPL, __local_depot2;
+; PTX64-NEXT:    ld.param.u32 %r1, [foo3_param_0];
+; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX64-NEXT:    mul.wide.s32 %rd3, %r1, 4;
+; PTX64-NEXT:    add.s64 %rd4, %rd2, %rd3;
+; PTX64-NEXT:    st.local.u32 [%rd4], %r1;
+; PTX64-NEXT:    ret;
   %local = alloca [3 x i32], align 4
   %1 = getelementptr inbounds i32, ptr %local, i32 %a
   store i32 %a, ptr %1
   ret void
 }
 
-; PTX32:        cvta.local.u32   %SP, %SPL;
-; PTX32:        add.u32          {{%r[0-9]+}}, %SP, 0;
-; PTX32:        add.u32          {{%r[0-9]+}}, %SPL, 0;
-; PTX32:        add.u32          {{%r[0-9]+}}, %SP, 4;
-; PTX32:        add.u32          {{%r[0-9]+}}, %SPL, 4;
-; PTX32:        st.local.u32     [{{%r[0-9]+}}], {{%r[0-9]+}}
-; PTX32:        st.local.u32     [{{%r[0-9]+}}], {{%r[0-9]+}}
-; PTX64:        cvta.local.u64   %SP, %SPL;
-; PTX64:        add.u64          {{%rd[0-9]+}}, %SP, 0;
-; PTX64:        add.u64          {{%rd[0-9]+}}, %SPL, 0;
-; PTX64:        add.u64          {{%rd[0-9]+}}, %SP, 4;
-; PTX64:        add.u64          {{%rd[0-9]+}}, %SPL, 4;
-; PTX64:        st.local.u32     [{{%rd[0-9]+}}], {{%r[0-9]+}}
-; PTX64:        st.local.u32     [{{%rd[0-9]+}}], {{%r[0-9]+}}
 define void @foo4() {
+; PTX32-LABEL: foo4(
+; PTX32:       {
+; PTX32-NEXT:    .local .align 4 .b8 __local_depot3[8];
+; PTX32-NEXT:    .reg .b32 %SP;
+; PTX32-NEXT:    .reg .b32 %SPL;
+; PTX32-NEXT:    .reg .b32 %r<6>;
+; PTX32-EMPTY:
+; PTX32-NEXT:  // %bb.0:
+; PTX32-NEXT:    mov.u32 %SPL, __local_depot3;
+; PTX32-NEXT:    cvta.local.u32 %SP, %SPL;
+; PTX32-NEXT:    add.u32 %r1, %SP, 0;
+; PTX32-NEXT:    add.u32 %r2, %SPL, 0;
+; PTX32-NEXT:    add.u32 %r3, %SP, 4;
+; PTX32-NEXT:    add.u32 %r4, %SPL, 4;
+; PTX32-NEXT:    mov.b32 %r5, 0;
+; PTX32-NEXT:    st.local.u32 [%r2], %r5;
+; PTX32-NEXT:    st.local.u32 [%r4], %r5;
+; PTX32-NEXT:    { // callseq 1, 0
+; PTX32-NEXT:    .param .b32 param0;
+; PTX32-NEXT:    st.param.b32 [param0], %r1;
+; PTX32-NEXT:    call.uni
+; PTX32-NEXT:    bar,
+; PTX32-NEXT:    (
+; PTX32-NEXT:    param0
+; PTX32-NEXT:    );
+; PTX32-NEXT:    } // callseq 1
+; PTX32-NEXT:    { // callseq 2, 0
+; PTX32-NEXT:    .param .b32 param0;
+; PTX32-NEXT:    st.param.b32 [param0], %r3;
+; PTX32-NEXT:    call.uni
+; PTX32-NEXT:    bar,
+; PTX32-NEXT:    (
+; PTX32-NEXT:    param0
+; PTX32-NEXT:    );
+; PTX32-NEXT:    } // callseq 2
+; PTX32-NEXT:    ret;
+;
+; PTX64-LABEL: foo4(
+; PTX64:       {
+; PTX64-NEXT:    .local .align 4 .b8 __local_depot3[8];
+; PTX64-NEXT:    .reg .b64 %SP;
+; PTX64-NEXT:    .reg .b64 %SPL;
+; PTX64-NEXT:    .reg .b32 %r<2>;
+; PTX64-NEXT:    .reg .b64 %rd<5>;
+; PTX64-EMPTY:
+; PTX64-NEXT:  // %bb.0:
+; PTX64-NEXT:    mov.u64 %SPL, __local_depot3;
+; PTX64-NEXT:    cvta.local.u64 %SP, %SPL;
+; PTX64-NEXT:    add.u64 %rd1, %SP, 0;
+; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX64-NEXT:    add.u64 %rd3, %SP, 4;
+; PTX64-NEXT:    add.u64 %rd4, %SPL, 4;
+; PTX64-NEXT:    mov.b32 %r1, 0;
+; PTX64-NEXT:    st.local.u32 [%rd2], %r1;
+; PTX64-NEXT:    st.local.u32 [%rd4], %r1;
+; PTX64-NEXT:    { // callseq 1, 0
+; PTX64-NEXT:    .param .b64 param0;
+; PTX64-NEXT:    st.param.b64 [param0], %rd1;
+; PTX64-NEXT:    call.uni
+; PTX64-NEXT:    bar,
+; PTX64-NEXT:    (
+; PTX64-NEXT:    param0
+; PTX64-NEXT:    );
+; PTX64-NEXT:    } // callseq 1
+; PTX64-NEXT:    { // callseq 2, 0
+; PTX64-NEXT:    .param .b64 param0;
+; PTX64-NEXT:    st.param.b64 [param0], %rd3;
+; PTX64-NEXT:    call.uni
+; PTX64-NEXT:    bar,
+; PTX64-NEXT:    (
+; PTX64-NEXT:    param0
+; PTX64-NEXT:    );
+; PTX64-NEXT:    } // callseq 2
+; PTX64-NEXT:    ret;
   %A = alloca i32
   %B = alloca i32
   store i32 0, ptr %A

diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 28be5d7adbf8a..fe15be5663be1 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -29,7 +29,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b16 %rs<3>;
 ; PTX-NEXT:    .reg .b32 %r<11>;
-; PTX-NEXT:    .reg .b64 %rd<9>;
+; PTX-NEXT:    .reg .b64 %rd<10>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.u64 %SPL, __local_depot0;
@@ -37,23 +37,24 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    ld.param.u8 %rs1, [non_kernel_function_param_1];
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.eq.b16 %p1, %rs2, 1;
-; PTX-NEXT:    ld.param.s32 %rd1, [non_kernel_function_param_2];
-; PTX-NEXT:    ld.param.u64 %rd2, [non_kernel_function_param_0+8];
-; PTX-NEXT:    st.u64 [%SP+8], %rd2;
-; PTX-NEXT:    ld.param.u64 %rd3, [non_kernel_function_param_0];
-; PTX-NEXT:    st.u64 [%SP], %rd3;
-; PTX-NEXT:    mov.u64 %rd4, gi;
-; PTX-NEXT:    cvta.global.u64 %rd5, %rd4;
-; PTX-NEXT:    add.u64 %rd6, %SP, 0;
-; PTX-NEXT:    selp.b64 %rd7, %rd6, %rd5, %p1;
-; PTX-NEXT:    add.s64 %rd8, %rd7, %rd1;
-; PTX-NEXT:    ld.u8 %r1, [%rd8];
-; PTX-NEXT:    ld.u8 %r2, [%rd8+1];
+; PTX-NEXT:    add.u64 %rd1, %SP, 0;
+; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
+; PTX-NEXT:    ld.param.s32 %rd3, [non_kernel_function_param_2];
+; PTX-NEXT:    ld.param.u64 %rd4, [non_kernel_function_param_0+8];
+; PTX-NEXT:    st.local.u64 [%rd2+8], %rd4;
+; PTX-NEXT:    ld.param.u64 %rd5, [non_kernel_function_param_0];
+; PTX-NEXT:    st.local.u64 [%rd2], %rd5;
+; PTX-NEXT:    mov.u64 %rd6, gi;
+; PTX-NEXT:    cvta.global.u64 %rd7, %rd6;
+; PTX-NEXT:    selp.b64 %rd8, %rd1, %rd7, %p1;
+; PTX-NEXT:    add.s64 %rd9, %rd8, %rd3;
+; PTX-NEXT:    ld.u8 %r1, [%rd9];
+; PTX-NEXT:    ld.u8 %r2, [%rd9+1];
 ; PTX-NEXT:    shl.b32 %r3, %r2, 8;
 ; PTX-NEXT:    or.b32 %r4, %r3, %r1;
-; PTX-NEXT:    ld.u8 %r5, [%rd8+2];
+; PTX-NEXT:    ld.u8 %r5, [%rd9+2];
 ; PTX-NEXT:    shl.b32 %r6, %r5, 16;
-; PTX-NEXT:    ld.u8 %r7, [%rd8+3];
+; PTX-NEXT:    ld.u8 %r7, [%rd9+3];
 ; PTX-NEXT:    shl.b32 %r8, %r7, 24;
 ; PTX-NEXT:    or.b32 %r9, %r8, %r6;
 ; PTX-NEXT:    or.b32 %r10, %r9, %r4;

diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 81b86c86d40de..23cf1a85789e4 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -40,24 +40,25 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
 ; PTX-NEXT:    .local .align 8 .b8 __local_depot1[8];
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
-; PTX-NEXT:    .reg .b64 %rd<5>;
+; PTX-NEXT:    .reg .b64 %rd<6>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.u64 %SPL, __local_depot1;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    ld.param.u64 %rd1, [load_padding_param_0];
-; PTX-NEXT:    st.u64 [%SP], %rd1;
-; PTX-NEXT:    add.u64 %rd2, %SP, 0;
+; PTX-NEXT:    add.u64         %rd1, %SP, 0;
+; PTX-NEXT:    add.u64         %rd2, %SPL, 0;
+; PTX-NEXT:    ld.param.u64 %rd3, [load_padding_param_0];
+; PTX-NEXT:    st.local.u64 [%rd2], %rd3;
 ; PTX-NEXT:    { // callseq 1, 0
 ; PTX-NEXT:    .param .b64 param0;
-; PTX-NEXT:    st.param.b64 [param0], %rd2;
+; PTX-NEXT:    st.param.b64 [param0], %rd1;
 ; PTX-NEXT:    .param .b64 retval0;
 ; PTX-NEXT:    call.uni (retval0),
 ; PTX-NEXT:    escape,
 ; PTX-NEXT:    (
 ; PTX-NEXT:    param0
 ; PTX-NEXT:    );
-; PTX-NEXT:    ld.param.b64 %rd3, [retval0];
+; PTX-NEXT:    ld.param.b64 %rd4, [retval0];
 ; PTX-NEXT:    } // callseq 1
 ; PTX-NEXT:    ret;
   %tmp = call ptr @escape(ptr nonnull align 16 %arg)

diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index eda4121fee702..4d4db21c6ed0d 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -148,35 +148,34 @@ entry:
 define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-LABEL: variadics2(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .local .align 2 .b8 __local_depot2[4];
+; CHECK-PTX-NEXT:    .local .align 1 .b8 __local_depot2[3];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
+; CHECK-PTX-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
-; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    ld.param.u32 %r1, [variadics2_param_0];
 ; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics2_param_1];
-; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
-; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd3];
-; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd3+4];
-; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd3+7];
-; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs1;
-; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd3+5];
-; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd3+6];
-; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
-; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
-; CHECK-PTX-NEXT:    st.u16 [%SP], %rs5;
-; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3+8];
+; CHECK-PTX-NEXT:    add.u64 %rd3, %SPL, 0;
+; CHECK-PTX-NEXT:    add.s64 %rd4, %rd1, 7;
+; CHECK-PTX-NEXT:    and.b64 %rd5, %rd4, -8;
+; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd5];
+; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd5+4];
+; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd5+7];
+; CHECK-PTX-NEXT:    st.local.u8 [%rd3+2], %rs1;
+; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd5+6];
+; CHECK-PTX-NEXT:    st.local.u8 [%rd3+1], %rs2;
+; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd5+5];
+; CHECK-PTX-NEXT:    st.local.u8 [%rd3], %rs3;
+; CHECK-PTX-NEXT:    ld.u64 %rd6, [%rd5+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd6;
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd7, %r5;
+; CHECK-PTX-NEXT:    add.s64 %rd8, %rd7, %rd6;
+; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd8;
 ; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-PTX-NEXT:    ret;
 entry:
@@ -213,36 +212,36 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot3[24];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b16 %rs<10>;
+; CHECK-PTX-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot3;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT:    mov.u64 %rd1, __const_$_bar_$_s1;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs1, [%rd1+7];
+; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
+; CHECK-PTX-NEXT:    mov.u64 %rd3, __const_$_bar_$_s1;
+; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs1, [%rd3+7];
 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs2, %rs1;
-; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs2;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs3, [%rd1+5];
+; CHECK-PTX-NEXT:    st.local.u8 [%rd2+2], %rs2;
+; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs3, [%rd3+6];
 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs4, %rs3;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs5, [%rd1+6];
+; CHECK-PTX-NEXT:    st.local.u8 [%rd2+1], %rs4;
+; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs5, [%rd3+5];
 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs6, %rs5;
-; CHECK-PTX-NEXT:    shl.b16 %rs7, %rs6, 8;
-; CHECK-PTX-NEXT:    or.b16 %rs8, %rs7, %rs4;
-; CHECK-PTX-NEXT:    st.u16 [%SP], %rs8;
+; CHECK-PTX-NEXT:    st.local.u8 [%rd2], %rs6;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
-; CHECK-PTX-NEXT:    mov.b16 %rs9, 1;
-; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs9;
-; CHECK-PTX-NEXT:    mov.b64 %rd2, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd2;
-; CHECK-PTX-NEXT:    add.u64 %rd3, %SP, 8;
+; CHECK-PTX-NEXT:    mov.b16 %rs7, 1;
+; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs7;
+; CHECK-PTX-NEXT:    mov.b64 %rd4, 1;
+; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd4;
+; CHECK-PTX-NEXT:    add.u64 %rd5, %SP, 8;
 ; CHECK-PTX-NEXT:    { // callseq 1, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
 ; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT:    .param .b64 param1;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd3;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd5;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
 ; CHECK-PTX-NEXT:    call.uni (retval0),
 ; CHECK-PTX-NEXT:    variadics2,
@@ -381,25 +380,28 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<6>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<10>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot7;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd1, [__const_$_qux_$_s];
-; CHECK-PTX-NEXT:    st.u64 [%SP], %rd1;
-; CHECK-PTX-NEXT:    mov.u64 %rd2, __const_$_qux_$_s;
-; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd3, [%rd2+8];
-; CHECK-PTX-NEXT:    st.u64 [%SP+8], %rd3;
-; CHECK-PTX-NEXT:    mov.b64 %rd4, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd4;
-; CHECK-PTX-NEXT:    add.u64 %rd5, %SP, 16;
+; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
+; CHECK-PTX-NEXT:    mov.u64 %rd3, __const_$_qux_$_s;
+; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd4, [%rd3+8];
+; CHECK-PTX-NEXT:    st.local.u64 [%rd2+8], %rd4;
+; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd5, [__const_$_qux_$_s];
+; CHECK-PTX-NEXT:    st.local.u64 [%rd2], %rd5;
+; CHECK-PTX-NEXT:    mov.b64 %rd6, 1;
+; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd6;
+; CHECK-PTX-NEXT:    ld.local.u64 %rd7, [%rd2];
+; CHECK-PTX-NEXT:    ld.local.u64 %rd8, [%rd2+8];
+; CHECK-PTX-NEXT:    add.u64 %rd9, %SP, 16;
 ; CHECK-PTX-NEXT:    { // callseq 3, 0
 ; CHECK-PTX-NEXT:    .param .align 8 .b8 param0[16];
-; CHECK-PTX-NEXT:    st.param.b64 [param0], %rd1;
-; CHECK-PTX-NEXT:    st.param.b64 [param0+8], %rd3;
+; CHECK-PTX-NEXT:    st.param.b64 [param0], %rd7;
+; CHECK-PTX-NEXT:    st.param.b64 [param0+8], %rd8;
 ; CHECK-PTX-NEXT:    .param .b64 param1;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd9;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
 ; CHECK-PTX-NEXT:    call.uni (retval0),
 ; CHECK-PTX-NEXT:    variadics4,

diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/alloca.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/alloca.ll
new file mode 100644
index 0000000000000..fa063cdf8d805
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/alloca.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+
+define float @load_alloca() {
+; CHECK-LABEL: define float @load_alloca() {
+; CHECK-NEXT:    [[ADDR:%.*]] = alloca float, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5)
+; CHECK-NEXT:    [[VAL:%.*]] = load float, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT:    ret float [[VAL]]
+;
+  %addr = alloca float
+  %val = load float, ptr %addr
+  ret float %val
+}

diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
index a64364019de15..820ade631dd64 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
@@ -9,21 +9,21 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
 ; CHECK-NEXT:    .local .align 8 .b8 __local_depot0[32];
 ; CHECK-NEXT:    .reg .b32 %SP;
 ; CHECK-NEXT:    .reg .b32 %SPL;
-; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.u32 %SPL, __local_depot0;
-; CHECK-NEXT:    cvta.local.u32 %SP, %SPL;
 ; CHECK-NEXT:    ld.param.u32 %r1, [caller_St8x4_param_1];
+; CHECK-NEXT:    add.u32 %r3, %SPL, 0;
 ; CHECK-NEXT:    ld.param.u64 %rd1, [caller_St8x4_param_0+24];
-; CHECK-NEXT:    st.u64 [%SP+24], %rd1;
+; CHECK-NEXT:    st.local.u64 [%r3+24], %rd1;
 ; CHECK-NEXT:    ld.param.u64 %rd2, [caller_St8x4_param_0+16];
-; CHECK-NEXT:    st.u64 [%SP+16], %rd2;
+; CHECK-NEXT:    st.local.u64 [%r3+16], %rd2;
 ; CHECK-NEXT:    ld.param.u64 %rd3, [caller_St8x4_param_0+8];
-; CHECK-NEXT:    st.u64 [%SP+8], %rd3;
+; CHECK-NEXT:    st.local.u64 [%r3+8], %rd3;
 ; CHECK-NEXT:    ld.param.u64 %rd4, [caller_St8x4_param_0];
-; CHECK-NEXT:    st.u64 [%SP], %rd4;
+; CHECK-NEXT:    st.local.u64 [%r3], %rd4;
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[32];
 ; CHECK-NEXT:    st.param.v2.b64 [param0], {%rd4, %rd3};

