[clang] [llvm] [NVPTX] Change the alloca address space in NVPTXLowerAlloca (PR #154814)
Theodoros Theodoridis via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 06:01:14 PDT 2025
================
@@ -139,21 +134,20 @@ define void @foo4() {
; PTX32-EMPTY:
; PTX32-NEXT: // %bb.0:
; PTX32-NEXT: mov.b32 %SPL, __local_depot3;
-; PTX32-NEXT: cvta.local.u32 %SP, %SPL;
-; PTX32-NEXT: add.u32 %r1, %SP, 0;
-; PTX32-NEXT: add.u32 %r2, %SPL, 0;
-; PTX32-NEXT: add.u32 %r3, %SP, 4;
-; PTX32-NEXT: add.u32 %r4, %SPL, 4;
-; PTX32-NEXT: st.local.b32 [%r2], 0;
-; PTX32-NEXT: st.local.b32 [%r4], 0;
+; PTX32-NEXT: add.u32 %r1, %SPL, 0;
+; PTX32-NEXT: cvta.local.u32 %r2, %r1;
+; PTX32-NEXT: add.u32 %r3, %SPL, 4;
+; PTX32-NEXT: cvta.local.u32 %r4, %r3;
+; PTX32-NEXT: st.local.b32 [%SPL], 0;
+; PTX32-NEXT: st.local.b32 [%SPL+4], 0;
----------------
thetheodor wrote:
I think the issue stems from the following:
1. Previously the IR looked like this:
```
define void @foo4() {
%A = alloca i32, align 4
%1 = addrspacecast ptr %A to ptr addrspace(5)
%B = alloca i32, align 4
%2 = addrspacecast ptr %B to ptr addrspace(5)
store i32 0, ptr addrspace(5) %1, align 4
store i32 0, ptr addrspace(5) %2, align 4
call void @bar(ptr %A)
call void @bar(ptr %B)
ret void
}
```
The function-call arguments were generic pointers, and they were lowered as follows:
```
bb.0 (%ir-block.0):
%0:b64 = LEA_ADDRi64 %stack.0.A, 0 <- generic address
%1:b64 = cvta_to_local_64 %0:b64
%2:b64 = LEA_ADDRi64 %stack.1.B, 0 <- generic address
%3:b64 = cvta_to_local_64 %2:b64
...
ST_i64 %0:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101)
...
ST_i64 %2:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101)
```
The two `cvta_to_local_64` instructions are eventually optimized away:
```
$vrframelocal64 = MOV_DEPOT_ADDR_64 3
$vrframe64 = cvta_local_64 $vrframelocal64
%0:b64 = LEA_ADDRi64 $vrframe64, 0
%1:b64 = cvta_to_local_64 %0:b64
%2:b64 = LEA_ADDRi64 $vrframe64, 4
%3:b64 = cvta_to_local_64 %2:b64
```
->
```
$vrframelocal64 = MOV_DEPOT_ADDR_64 3
$vrframe64 = cvta_local_64 $vrframelocal64
%0:b64 = LEA_ADDRi64 $vrframe64, 0
%1:b64 = LEA_ADDRi64 $vrframelocal64, 0
%2:b64 = LEA_ADDRi64 $vrframe64, 4
%3:b64 = LEA_ADDRi64 $vrframelocal64, 4
```
2. The current IR looks like this:
```
define void @foo4() {
%A1 = alloca i32, align 4, addrspace(5)
%1 = addrspacecast ptr addrspace(5) %A1 to ptr
%B2 = alloca i32, align 4, addrspace(5)
%2 = addrspacecast ptr addrspace(5) %B2 to ptr
store i32 0, ptr addrspace(5) %A1, align 4
store i32 0, ptr addrspace(5) %B2, align 4
call void @bar(ptr %1)
call void @bar(ptr %2)
ret void
}
```
i.e., the arguments are now the address-space-cast results (not the alloca pointers, as before). This is lowered to:
```
bb.0 (%ir-block.0):
%0:b64 = LEA_ADDRi64 %stack.0.A1, 0
%1:b64 = cvta_local_64 killed %0:b64 <- param0 is a cast result
%2:b64 = LEA_ADDRi64 %stack.1.B2, 0
%3:b64 = cvta_local_64 killed %2:b64 <- param1 is a cast result
...
ST_i64 killed %1:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101)
...
ST_i64 killed %3:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101)
```
The prologue ends up being:
```
$vrframelocal64 = MOV_DEPOT_ADDR_64 3
%0:b64 = LEA_ADDRi64 $vrframelocal64, 0
%1:b64 = cvta_local_64 %0:b64
%2:b64 = LEA_ADDRi64 $vrframelocal64, 4
%3:b64 = cvta_local_64 %2:b64
```
and is never optimized away. I am wondering whether a simple peephole optimization is missing.
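To make the shape of that hypothetical peephole concrete, here is a minimal standalone sketch (in Python, purely illustrative; a real fix would pattern-match `MachineInstr`s, likely in NVPTXPeephole.cpp, not strings). It models the fold `%d = cvta_local_64 (%s = LEA_ADDRi64 $vrframelocal64, off)` -> `%d = LEA_ADDRi64 $vrframe64, off`, i.e. the mirror image of the existing `cvta_to_local_64` fold shown above:

```python
# Toy model of the prologue MIR: each instruction is a
# (opcode, def, src, offset) tuple. Names like $vrframe64 /
# $vrframelocal64 follow the MIR dumps above.

def fold_cvta_local(insts):
    """Rewrite cvta_local_64 of a LEA off the local frame pointer
    into a LEA off the generic frame pointer."""
    defs = {i[1]: i for i in insts}  # def-reg -> defining instruction
    out = []
    for op, dst, src, off in insts:
        src_def = defs.get(src)
        if (op == "cvta_local_64" and src_def
                and src_def[0] == "LEA_ADDRi64"
                and src_def[2] == "$vrframelocal64"):
            # The now-dead LEA feeding the cvta would be cleaned up
            # by later dead-code elimination.
            out.append(("LEA_ADDRi64", dst, "$vrframe64", src_def[3]))
        else:
            out.append((op, dst, src, off))
    return out

# The un-optimized prologue from the dump above:
prologue = [
    ("LEA_ADDRi64", "%0", "$vrframelocal64", 0),
    ("cvta_local_64", "%1", "%0", 0),
    ("LEA_ADDRi64", "%2", "$vrframelocal64", 4),
    ("cvta_local_64", "%3", "%2", 0),
]
folded = fold_cvta_local(prologue)
```

After the fold, `%1` and `%3` become plain `LEA_ADDRi64 $vrframe64, 0/4`, matching the shape the old lowering reached.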
https://github.com/llvm/llvm-project/pull/154814