[llvm] [AMDGPU] Restrict promote alloca on pointers across address spaces (PR #119762)

Sumanth Gundapaneni via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 12 13:19:07 PST 2024


https://github.com/sgundapa created https://github.com/llvm/llvm-project/pull/119762

If a load or store moves a pointer value to or from a stack slot in a different address space, the promote-alloca pass must not vectorize the access when the two address spaces use different pointer sizes.
Example: In address space 0, the pointer size is 64 bits.
         In address space 5, the pointer size is 32 bits.
Casting a pointer between such address spaces is undefined behavior.
The underlying assertion failure was found through fuzzing.
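
For illustration, a minimal IR sketch of the rejected and accepted cases (adapted from the tests in the patch below; it assumes the AMDGPU data layout, where addrspace(0) pointers are 64 bits and addrspace(3) and addrspace(5) pointers are 32 bits; %arg and %arg3 are hypothetical function arguments):

  %alloca = alloca [8 x i8], align 8, addrspace(5)
  ; Rejected: the stored value is a 64-bit addrspace(0) pointer, but the
  ; addrspace(5) stack slot uses 32-bit pointers.
  store ptr %arg, ptr addrspace(5) %alloca, align 8
  ; OK: ptr addrspace(3) is 32 bits, matching the 32-bit pointer size of
  ; addrspace(5).
  store ptr addrspace(3) %arg3, ptr addrspace(5) %alloca, align 8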

From 770364bcffeef16e27f65c81ef1f5d6803e16041 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at useocpm2m-387-002.amd.com>
Date: Thu, 12 Dec 2024 13:04:20 -0800
Subject: [PATCH] [AMDGPU] Restrict promote alloca on pointers across address
 spaces

If a load or store moves a pointer value to or from a stack slot in a
different address space, the promote-alloca pass must not vectorize the
access when the two address spaces use different pointer sizes.
Example: In address space 0, the pointer size is 64 bits.
         In address space 5, the pointer size is 32 bits.
Casting a pointer between such address spaces is undefined behavior.
The underlying assertion failure was found through fuzzing.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 10 +++
 .../AMDGPU/promote-alloca-loadstores.ll       | 67 ++++++++++---------
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  | 51 +++++++-------
 3 files changed, 70 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e27ef71c1c0883..913a601b0e0888 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -796,6 +796,16 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       if (!IsSimple)
         return RejectUser(Inst, "not a simple load or store");
 
+      // If the access type is a pointer, reject the access when the address
+      // spaces involved have different pointer sizes.
+      // store <2 x ptr> %arg, ptr addrspace(5) %alloca - reject.
+      // %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca - ok.
+      if (AccessTy->isPtrOrPtrVectorTy()) {
+        if (DL->getPointerSize(getLoadStoreAddressSpace(Inst)) !=
+            DL->getPointerSize(AccessTy->getPointerAddressSpace()))
+          return RejectUser(Inst, "pointers to incompatible address spaces");
+      }
+
       Ptr = Ptr->stripPointerCasts();
 
       // Alloca already accessed as vector.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
index 1e49500a243e10..2a22cdda7a7e79 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -93,21 +93,6 @@ end:
   ret void
 }
 
-define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
-; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
-; CHECK-SAME: (ptr [[ARG:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
-; CHECK-NEXT:    ret ptr [[ARG]]
-;
-entry:
-  %alloca = alloca [8 x i8], align 8, addrspace(5)
-  store ptr %arg, ptr addrspace(5) %alloca, align 8
-  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
-  ret ptr %tmp
-}
-
 define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) {
 ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
 ; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
@@ -123,22 +108,6 @@ entry:
   ret ptr addrspace(3) %tmp
 }
 
-define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
-; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
-; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
-; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[TMP2]]
-;
-entry:
-  %alloca = alloca [4 x i32], align 8, addrspace(5)
-  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
-  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
-  ret <4 x ptr addrspace(3)> %tmp
-}
-
 define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
 ; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
 ; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
@@ -198,3 +167,39 @@ entry:
   %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8
   ret ptr addrspace(3) %tmp
 }
+
+; Will not be vectorized: the pointer load/store crosses address spaces
+; with different pointer sizes.
+define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
+; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
+; CHECK-SAME: (ptr [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5)
+; CHECK-NEXT:    store ptr [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    ret ptr [[TMP]]
+;
+entry:
+  %alloca = alloca [8 x i8], align 8, addrspace(5)
+  store ptr %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
+  ret ptr %tmp
+}
+
+; Will not be vectorized: the pointer load/store crosses address spaces
+; with different pointer sizes.
+define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
+; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
+; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x i32], align 8, addrspace(5)
+; CHECK-NEXT:    store <2 x ptr> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[TMP]]
+;
+entry:
+  %alloca = alloca [4 x i32], align 8, addrspace(5)
+  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
+  ret <4 x ptr addrspace(3)> %tmp
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 7c5410004ed5b7..b583f33a7d9e63 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -218,38 +218,35 @@ entry:
   ret void
 }
 
-define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(1)> %val.0, <4 x ptr addrspace(3)> %val.1) {
+define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(3)> %val.0, <4 x ptr addrspace(3)> %val.1) {
 ; CHECK-LABEL: define void @test_different_type_subvector_ptrs
-; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
+; CHECK-SAME: (<2 x ptr addrspace(3)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL_0]] to <2 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP3]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr addrspace(1)>
-; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(1)> [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i64 1
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP15]] to <4 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = inttoptr <4 x i32> [[TMP16]] to <4 x ptr addrspace(3)>
-; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP17]]
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VAL_0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr <2 x i32> [[TMP5]] to <2 x ptr addrspace(3)>
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(3)> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i64 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[TMP11]], i64 1
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = inttoptr <4 x i32> [[TMP15]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP16]]
 entry:
   %stack = alloca [4 x i64], align 4, addrspace(5)
 
-  store <2 x ptr addrspace(1)> %val.0, ptr addrspace(5) %stack
-  %reload = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack
-  %dummyuser = freeze <2 x ptr addrspace(1)> %reload
+  store <2 x ptr addrspace(3)> %val.0, ptr addrspace(5) %stack
+  %reload = load <2 x ptr addrspace(3)>, ptr addrspace(5) %stack
+  %dummyuser = freeze <2 x ptr addrspace(3)> %reload
 
   store <4 x ptr addrspace(3)> %val.1, ptr addrspace(5) %stack
   %reload.1 = load <4 x ptr addrspace(3)>, ptr addrspace(5) %stack


