[clang] [clang][CodeGen] `sret` args should always point to the `alloca` AS, so use that (PR #114062)

Alex Voicu via cfe-commits cfe-commits at lists.llvm.org
Fri Nov 15 15:05:10 PST 2024


https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/114062

>From d2d2d3d5db3f639aab178f9ca9a20db2842d2b65 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 29 Oct 2024 14:20:44 +0000
Subject: [PATCH 1/9] `sret` args should always point to the `alloca` AS, so we
 can use that.

---
 clang/lib/CodeGen/CGCall.cpp                   | 15 ++++++++-------
 clang/test/CodeGen/partial-reinitialization2.c |  4 ++--
 clang/test/CodeGen/sret.c                      | 11 +++++++++++
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 8f4f5d3ed81601..56acfae7ae9e51 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1672,8 +1672,7 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
 
   // Add type for sret argument.
   if (IRFunctionArgs.hasSRetArg()) {
-    QualType Ret = FI.getReturnType();
-    unsigned AddressSpace = CGM.getTypes().getTargetAddressSpace(Ret);
+    unsigned AddressSpace = CGM.getDataLayout().getAllocaAddrSpace();
     ArgTypes[IRFunctionArgs.getSRetArgNo()] =
         llvm::PointerType::get(getLLVMContext(), AddressSpace);
   }
@@ -5145,7 +5144,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // If the call returns a temporary with struct return, create a temporary
   // alloca to hold the result, unless one is given to us.
   Address SRetPtr = Address::invalid();
-  RawAddress SRetAlloca = RawAddress::invalid();
   llvm::Value *UnusedReturnSizePtr = nullptr;
   if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
     // For virtual function pointer thunks and musttail calls, we must always
@@ -5159,16 +5157,19 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     } else if (!ReturnValue.isNull()) {
       SRetPtr = ReturnValue.getAddress();
     } else {
-      SRetPtr = CreateMemTemp(RetTy, "tmp", &SRetAlloca);
+      SRetPtr = CreateMemTempWithoutCast(RetTy, "tmp");
       if (HaveInsertPoint() && ReturnValue.isUnused()) {
         llvm::TypeSize size =
             CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(RetTy));
-        UnusedReturnSizePtr = EmitLifetimeStart(size, SRetAlloca.getPointer());
+        UnusedReturnSizePtr = EmitLifetimeStart(size, SRetPtr.getBasePointer());
       }
     }
     if (IRFunctionArgs.hasSRetArg()) {
+      // If the caller allocated the return slot, it is possible that the
+      // alloca was AS cast to the default AS, so we ensure the cast is
+      // stripped before binding to the sret arg, which is in the alloca AS.
       IRCallArgs[IRFunctionArgs.getSRetArgNo()] =
-          getAsNaturalPointerTo(SRetPtr, RetTy);
+          getAsNaturalPointerTo(SRetPtr, RetTy)->stripPointerCasts();
     } else if (RetAI.isInAlloca()) {
       Address Addr =
           Builder.CreateStructGEP(ArgMemory, RetAI.getInAllocaFieldIndex());
@@ -5740,7 +5741,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // pop this cleanup later on. Being eager about this is OK, since this
   // temporary is 'invisible' outside of the callee.
   if (UnusedReturnSizePtr)
-    pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, SRetAlloca,
+    pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, SRetPtr,
                                          UnusedReturnSizePtr);
 
   llvm::BasicBlock *InvokeDest = CannotThrow ? nullptr : getInvokeDest();
diff --git a/clang/test/CodeGen/partial-reinitialization2.c b/clang/test/CodeGen/partial-reinitialization2.c
index e709c1d4ad1ee1..7949a69555031e 100644
--- a/clang/test/CodeGen/partial-reinitialization2.c
+++ b/clang/test/CodeGen/partial-reinitialization2.c
@@ -91,8 +91,8 @@ void test5(void)
 // CHECK-LABEL: test6
 void test6(void)
 {
-  // CHECK: [[LP:%[a-z0-9]+]] = getelementptr{{.*}}%struct.LLP2P2, ptr{{.*}}, i32 0, i32 0
-  // CHECK: call {{.*}}get456789(ptr {{.*}}[[LP]])
+  // CHECK: [[VAR:%[a-z0-9]+]] = alloca
+  // CHECK: call {{.*}}get456789(ptr {{.*}}sret{{.*}} [[VAR]])
 
   // CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get235()
   // CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9.]+]]
diff --git a/clang/test/CodeGen/sret.c b/clang/test/CodeGen/sret.c
index 6d905e89b2c6fd..3b4914f29d2bfe 100644
--- a/clang/test/CodeGen/sret.c
+++ b/clang/test/CodeGen/sret.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -Wno-strict-prototypes -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wno-strict-prototypes -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck --check-prefix=NONZEROALLOCAAS %s
 
 struct abc {
  long a;
@@ -6,18 +7,28 @@ struct abc {
  long c;
  long d;
  long e;
+ long f;
+ long g;
+ long h;
+ long i;
+ long j;
 };
 
 struct abc foo1(void);
 // CHECK-DAG: declare {{.*}} @foo1(ptr dead_on_unwind writable sret(%struct.abc)
+// NONZEROALLOCAAS-DAG: declare {{.*}} @foo1(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc)
 struct abc foo2();
 // CHECK-DAG: declare {{.*}} @foo2(ptr dead_on_unwind writable sret(%struct.abc)
+// NONZEROALLOCAAS-DAG: declare {{.*}} @foo2(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc)
 struct abc foo3(void){}
 // CHECK-DAG: define {{.*}} @foo3(ptr dead_on_unwind noalias writable sret(%struct.abc)
+// NONZEROALLOCAAS-DAG: define {{.*}} @foo3(ptr addrspace(5) dead_on_unwind noalias writable sret(%struct.abc)
 
 void bar(void) {
   struct abc dummy1 = foo1();
   // CHECK-DAG: call {{.*}} @foo1(ptr dead_on_unwind writable sret(%struct.abc)
+  // NONZEROALLOCAAS-DAG: call {{.*}} @foo1(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc)
   struct abc dummy2 = foo2();
   // CHECK-DAG: call {{.*}} @foo2(ptr dead_on_unwind writable sret(%struct.abc)
+  // NONZEROALLOCAAS-DAG: call {{.*}} @foo2(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc)
 }
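
For context, a minimal sketch of the behaviour this first patch changes, written in the
style of the sret.c test above (hypothetical reduced example, not part of the patch;
assumes the amdgcn-amd-amdhsa triple, where allocas live in addrspace(5)). The struct is
made large enough to force an indirect return, and the sret pointer is then typed in the
alloca AS rather than the default AS:

// Hypothetical reduced example in C; names are illustrative.
// RUN: %clang_cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s

struct big { long a, b, c, d, e, f, g, h, i, j; };

struct big make_big(void);
// Previously the sret pointer was a default-AS `ptr`; with this patch it is in
// the alloca AS:
// CHECK: declare {{.*}} @make_big(ptr addrspace(5) dead_on_unwind writable sret(%struct.big)

void use_big(void) {
  struct big b = make_big();
  // CHECK: call {{.*}} @make_big(ptr addrspace(5) dead_on_unwind writable sret(%struct.big)
}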

>From b5a7df0a771cb70d60e58a8727a5d856219dacb3 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 29 Oct 2024 17:16:17 +0000
Subject: [PATCH 2/9] Fix broken tests.

---
 clang/test/CodeGenOpenCL/addr-space-struct-arg.cl       | 4 ++--
 clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 57d056b0ff9d51..4a1db2c3564a57 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -250,7 +250,7 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
 // AMDGCN-NEXT:    ret void
 //
 // AMDGCN20-LABEL: define dso_local void @foo_large(
-// AMDGCN20-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
 // AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
 // AMDGCN20-NEXT:    [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
@@ -335,7 +335,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN20-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
 // AMDGCN20-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
-// AMDGCN20-NEXT:    call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
 // AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
 // AMDGCN20-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index 084281a8cada46..c2b2e00d15e13f 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -91,7 +91,7 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
 }
 
 // AMDGCN-LABEL: define dso_local void @foo_large(
-// AMDGCN-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
 // AMDGCN-NEXT:    [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
@@ -120,7 +120,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
-// AMDGCN-NEXT:    call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
 // AMDGCN-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
 // AMDGCN-NEXT:    ret void
 //

>From 2de33d4cfb210dc50a55b9ba87fa0d086d4b8d9f Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 30 Oct 2024 00:10:59 +0000
Subject: [PATCH 3/9] Handle passing an `alloca`ed `sret` arg directly to a
 callee that expects a pointer to the default AS.

---
 clang/lib/CodeGen/CGCall.cpp                    | 16 ++++++++++++----
 clang/test/CodeGenCXX/no-elide-constructors.cpp |  4 ++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 56acfae7ae9e51..7171d85b0d0ab0 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5391,11 +5391,19 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
             V->getType()->isIntegerTy())
           V = Builder.CreateZExt(V, ArgInfo.getCoerceToType());
 
-        // If the argument doesn't match, perform a bitcast to coerce it.  This
-        // can happen due to trivial type mismatches.
+        // If the argument doesn't match, we are either trying to pass an
+        // alloca-ed sret argument directly, and the alloca AS does not match
+        // the default AS, in which case we AS cast it, or we have a trivial
+        // type mismatch, and thus perform a bitcast to coerce it.
         if (FirstIRArg < IRFuncTy->getNumParams() &&
-            V->getType() != IRFuncTy->getParamType(FirstIRArg))
-          V = Builder.CreateBitCast(V, IRFuncTy->getParamType(FirstIRArg));
+            V->getType() != IRFuncTy->getParamType(FirstIRArg)) {
+          auto IRTy = IRFuncTy->getParamType(FirstIRArg);
+          auto MaybeSRetArg = dyn_cast_or_null<llvm::Argument>(V);
+          if (MaybeSRetArg && MaybeSRetArg->hasStructRetAttr())
+            V = Builder.CreateAddrSpaceCast(V, IRTy);
+          else
+            V = Builder.CreateBitCast(V, IRTy);
+        }
 
         if (ArgHasMaybeUndefAttr)
           V = Builder.CreateFreeze(V);
diff --git a/clang/test/CodeGenCXX/no-elide-constructors.cpp b/clang/test/CodeGenCXX/no-elide-constructors.cpp
index 750392a43e05cc..098163f957f759 100644
--- a/clang/test/CodeGenCXX/no-elide-constructors.cpp
+++ b/clang/test/CodeGenCXX/no-elide-constructors.cpp
@@ -1,7 +1,9 @@
 // RUN: %clang_cc1 -std=c++98 -triple i386-unknown-unknown -fno-elide-constructors -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX98
 // RUN: %clang_cc1 -std=c++11 -triple i386-unknown-unknown -fno-elide-constructors -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX11
+// RUN: %clang_cc1 -std=c++11 -triple amdgcn-amd-amdhsa -fno-elide-constructors -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK --check-prefix=CHECK-CXX11-NONZEROALLOCAAS
 // RUN: %clang_cc1 -std=c++98 -triple i386-unknown-unknown -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX98-ELIDE
 // RUN: %clang_cc1 -std=c++11 -triple i386-unknown-unknown -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX11-ELIDE
+// RUN: %clang_cc1 -std=c++11 -triple amdgcn-amd-amdhsa -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX11-NONZEROALLOCAAS-ELIDE
 
 // Reduced from PR12208
 class X {
@@ -23,8 +25,10 @@ X Test()
   // sret argument.
   // CHECK-CXX98: call void @_ZN1XC1ERKS_(
   // CHECK-CXX11: call void @_ZN1XC1EOS_(
+  // CHECK-CXX11-NONZEROALLOCAAS: call void @_ZN1XC1EOS_(
   // CHECK-CXX98-ELIDE-NOT: call void @_ZN1XC1ERKS_(
   // CHECK-CXX11-ELIDE-NOT: call void @_ZN1XC1EOS_(
+  // CHECK-CXX11-NONZEROALLOCAAS-ELIDE-NOT: call void @_ZN1XC1EOS_(
 
   // Make sure that the destructor for X is called.
   // FIXME: This call is present even in the -ELIDE runs, but is guarded by a
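
To illustrate the case this patch handles, a hypothetical reduced C++ example in the
spirit of the no-elide-constructors.cpp test above (assumes C++11 on amdgcn-amd-amdhsa
with -fno-elide-constructors): the sret slot is an addrspace(5) alloca, but the move
constructor expects a generic pointer, so CodeGen now inserts an addrspacecast at the
call site. The IR in the comments is the expected shape, not verbatim output:

// Hypothetical reduced example; mirrors the shape of the test, not the patch.
class X {
public:
  X();
  X(const X &);
  X(X &&);
  ~X();
};

X Test() {
  X x;
  // Without copy elision, x is moved into the sret slot, roughly:
  //   %0 = addrspacecast ptr addrspace(5) %agg.result to ptr
  //   call void @_ZN1XC1EOS_(ptr noundef nonnull align 1 dereferenceable(1) %0, ...)
  return x;
}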

>From b209d6779cccaa9c2f272d839263cf7ca139b945 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 2 Nov 2024 00:57:17 +0000
Subject: [PATCH 4/9] Add query for a possible target specific indirect arg AS.

---
 clang/include/clang/Basic/TargetInfo.h | 8 ++++++++
 clang/lib/CodeGen/CGCall.cpp           | 6 ++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 25eda907d20a7b..fa5021baf667b5 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1780,6 +1780,14 @@ class TargetInfo : public TransferrableTargetInfo,
     return 0;
   }
 
+  /// \returns Target specific address space for indirect (e.g. sret) arguments.
+  /// If such an address space exists, it must be convertible to and from the
+  /// alloca address space. If it does not, std::nullopt is returned and the
+  /// alloca address space will be used.
+  virtual std::optional<unsigned> getIndirectArgAddressSpace() const {
+    return std::nullopt;
+  }
+
   /// \returns If a target requires an address within a target specific address
   /// space \p AddressSpace to be converted in order to be used, then return the
   /// corresponding target specific DWARF address space.
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 7171d85b0d0ab0..87e70df795a986 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1672,9 +1672,11 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
 
   // Add type for sret argument.
   if (IRFunctionArgs.hasSRetArg()) {
-    unsigned AddressSpace = CGM.getDataLayout().getAllocaAddrSpace();
+    auto AddressSpace = CGM.getTarget().getIndirectArgAddressSpace();
+    if (!AddressSpace)
+      AddressSpace = getDataLayout().getAllocaAddrSpace();
     ArgTypes[IRFunctionArgs.getSRetArgNo()] =
-        llvm::PointerType::get(getLLVMContext(), AddressSpace);
+        llvm::PointerType::get(getLLVMContext(), *AddressSpace);
   }
 
   // Add type for inalloca argument.
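
A standalone sketch of the fallback logic this hook enables (the function and variable
names below are illustrative stand-ins, not the actual clang API; the real code queries
TargetInfo and the DataLayout as in the hunk above):

#include <optional>

// Stand-in for TargetInfo::getIndirectArgAddressSpace(): by default there is
// no target-specific AS for indirect arguments.
std::optional<unsigned> getIndirectArgAddressSpace() { return std::nullopt; }

// Stand-in for DataLayout::getAllocaAddrSpace(): e.g. 5 on amdgcn.
unsigned getAllocaAddrSpace() { return 5; }

unsigned sretAddressSpace() {
  // Prefer a target-specified AS; otherwise fall back to the alloca AS.
  std::optional<unsigned> AddressSpace = getIndirectArgAddressSpace();
  if (!AddressSpace)
    AddressSpace = getAllocaAddrSpace();
  return *AddressSpace;
}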

>From ac6367be734abec8f2c46f4fe8a13e950e13578f Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 2 Nov 2024 01:20:12 +0000
Subject: [PATCH 5/9] Add more context to test.

---
 clang/test/CodeGenCXX/no-elide-constructors.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/test/CodeGenCXX/no-elide-constructors.cpp b/clang/test/CodeGenCXX/no-elide-constructors.cpp
index 098163f957f759..994282debb0d08 100644
--- a/clang/test/CodeGenCXX/no-elide-constructors.cpp
+++ b/clang/test/CodeGenCXX/no-elide-constructors.cpp
@@ -17,6 +17,7 @@ class X {
 };
 
 // CHECK-LABEL: define{{.*}} void @_Z4Testv(
+// CHECK-SAME: ptr {{.*}}dead_on_unwind noalias writable sret([[CLASS_X:%.*]]) align 1 [[AGG_RESULT:%.*]])
 X Test()
 {
   X x;
@@ -25,7 +26,8 @@ X Test()
   // sret argument.
   // CHECK-CXX98: call void @_ZN1XC1ERKS_(
   // CHECK-CXX11: call void @_ZN1XC1EOS_(
-  // CHECK-CXX11-NONZEROALLOCAAS: call void @_ZN1XC1EOS_(
+  // CHECK-CXX11-NONZEROALLOCAAS: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr
+  // CHECK-CXX11-NONZEROALLOCAAS-NEXT: call void @_ZN1XC1EOS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]
   // CHECK-CXX98-ELIDE-NOT: call void @_ZN1XC1ERKS_(
   // CHECK-CXX11-ELIDE-NOT: call void @_ZN1XC1EOS_(
   // CHECK-CXX11-NONZEROALLOCAAS-ELIDE-NOT: call void @_ZN1XC1EOS_(

>From 9ff1d0dd16bbda206753348ab9671dcfe0b5eb7b Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 6 Nov 2024 13:16:03 +0200
Subject: [PATCH 6/9] Extend Indirect Args to carry an address space.

---
 clang/include/clang/CodeGen/CGFunctionInfo.h | 11 ++++++-----
 clang/lib/CodeGen/ABIInfo.cpp                |  2 +-
 clang/lib/CodeGen/ItaniumCXXABI.cpp          |  2 +-
 clang/lib/CodeGen/MicrosoftCXXABI.cpp        |  2 +-
 clang/lib/CodeGen/SwiftCallingConv.cpp       |  4 ++--
 clang/lib/CodeGen/Targets/AMDGPU.cpp         |  5 +++++
 clang/lib/CodeGen/Targets/ARC.cpp            |  2 +-
 clang/lib/CodeGen/Targets/ARM.cpp            |  4 ++--
 clang/lib/CodeGen/Targets/Lanai.cpp          |  2 +-
 clang/lib/CodeGen/Targets/PPC.cpp            |  4 ++--
 clang/lib/CodeGen/Targets/X86.cpp            | 16 ++++++++--------
 11 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h
index 9d785d878b61dc..4ca5d2b6548124 100644
--- a/clang/include/clang/CodeGen/CGFunctionInfo.h
+++ b/clang/include/clang/CodeGen/CGFunctionInfo.h
@@ -206,8 +206,8 @@ class ABIArgInfo {
   static ABIArgInfo getIgnore() {
     return ABIArgInfo(Ignore);
   }
-  static ABIArgInfo getIndirect(CharUnits Alignment, bool ByVal = true,
-                                bool Realign = false,
+  static ABIArgInfo getIndirect(CharUnits Alignment, unsigned AddrSpace = 0,
+                                bool ByVal = true, bool Realign = false,
                                 llvm::Type *Padding = nullptr) {
     auto AI = ABIArgInfo(Indirect);
     AI.setIndirectAlign(Alignment);
@@ -215,6 +215,7 @@ class ABIArgInfo {
     AI.setIndirectRealign(Realign);
     AI.setSRetAfterThis(false);
     AI.setPaddingType(Padding);
+    AI.setIndirectAddrSpace(AddrSpace);
     return AI;
   }
 
@@ -232,7 +233,7 @@ class ABIArgInfo {
 
   static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true,
                                      bool Realign = false) {
-    auto AI = getIndirect(Alignment, ByVal, Realign);
+    auto AI = getIndirect(Alignment, 0, ByVal, Realign);
     AI.setInReg(true);
     return AI;
   }
@@ -422,12 +423,12 @@ class ABIArgInfo {
   }
 
   unsigned getIndirectAddrSpace() const {
-    assert(isIndirectAliased() && "Invalid kind!");
+    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
     return IndirectAttr.AddrSpace;
   }
 
   void setIndirectAddrSpace(unsigned AddrSpace) {
-    assert(isIndirectAliased() && "Invalid kind!");
+    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
     IndirectAttr.AddrSpace = AddrSpace;
   }
 
diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp
index edd7146dc1ac76..7ab9f0aeb60993 100644
--- a/clang/lib/CodeGen/ABIInfo.cpp
+++ b/clang/lib/CodeGen/ABIInfo.cpp
@@ -174,7 +174,7 @@ bool ABIInfo::isPromotableIntegerTypeForABI(QualType Ty) const {
 ABIArgInfo ABIInfo::getNaturalAlignIndirect(QualType Ty, bool ByVal,
                                             bool Realign,
                                             llvm::Type *Padding) const {
-  return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty), ByVal,
+  return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty), 0, ByVal,
                                  Realign, Padding);
 }
 
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 9b3c2f1b2af677..f5e2b096212f4d 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -1351,7 +1351,7 @@ bool ItaniumCXXABI::classifyReturnType(CGFunctionInfo &FI) const {
   // If C++ prohibits us from making a copy, return by address.
   if (!RD->canPassInRegisters()) {
     auto Align = CGM.getContext().getTypeAlignInChars(FI.getReturnType());
-    FI.getReturnInfo() = ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+    FI.getReturnInfo() = ABIArgInfo::getIndirect(Align, 0, /*ByVal=*/false);
     return true;
   }
   return false;
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 3802dc8bcafc49..3b5b860a1b087f 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -1171,7 +1171,7 @@ bool MicrosoftCXXABI::classifyReturnType(CGFunctionInfo &FI) const {
 
   if (isIndirectReturn) {
     CharUnits Align = CGM.getContext().getTypeAlignInChars(FI.getReturnType());
-    FI.getReturnInfo() = ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+    FI.getReturnInfo() = ABIArgInfo::getIndirect(Align, 0, /*ByVal=*/false);
 
     // MSVC always passes `this` before the `sret` parameter.
     FI.getReturnInfo().setSRetAfterThis(FI.isInstanceMethod());
diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp
index ab2e2bd0b30646..e178c0fab5910d 100644
--- a/clang/lib/CodeGen/SwiftCallingConv.cpp
+++ b/clang/lib/CodeGen/SwiftCallingConv.cpp
@@ -801,7 +801,7 @@ static ABIArgInfo classifyExpandedType(SwiftAggLowering &lowering,
   if (lowering.empty()) {
     return ABIArgInfo::getIgnore();
   } else if (lowering.shouldPassIndirectly(forReturn)) {
-    return ABIArgInfo::getIndirect(alignmentForIndirect, /*byval*/ false);
+    return ABIArgInfo::getIndirect(alignmentForIndirect, 0, /*byval*/ false);
   } else {
     auto types = lowering.getCoerceAndExpandTypes();
     return ABIArgInfo::getCoerceAndExpand(types.first, types.second);
@@ -815,7 +815,7 @@ static ABIArgInfo classifyType(CodeGenModule &CGM, CanQualType type,
     auto &layout = CGM.getContext().getASTRecordLayout(record);
 
     if (mustPassRecordIndirectly(CGM, record))
-      return ABIArgInfo::getIndirect(layout.getAlignment(), /*byval*/ false);
+      return ABIArgInfo::getIndirect(layout.getAlignment(), 0, /*byval*/ false);
 
     SwiftAggLowering lowering(CGM);
     lowering.addTypedData(recordType->getDecl(), CharUnits::Zero(), layout);
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 56ad0503a11ab2..c45e7020de3f52 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -105,6 +105,11 @@ void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
   if (!getCXXABI().classifyReturnType(FI))
     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
 
+  // srets / indirect returns are unconditionally in the alloca AS.
+  if (FI.getReturnInfo().isIndirect())
+    FI.getReturnInfo().setIndirectAddrSpace(
+        getDataLayout().getAllocaAddrSpace());
+
   unsigned ArgumentIndex = 0;
   const unsigned numFixedArguments = FI.getNumRequiredArgs();
 
diff --git a/clang/lib/CodeGen/Targets/ARC.cpp b/clang/lib/CodeGen/Targets/ARC.cpp
index 1904e8fdb3888a..ee0db9778bdcb0 100644
--- a/clang/lib/CodeGen/Targets/ARC.cpp
+++ b/clang/lib/CodeGen/Targets/ARC.cpp
@@ -77,7 +77,7 @@ ABIArgInfo ARCABIInfo::getIndirectByValue(QualType Ty) const {
   // Compute the byval alignment.
   const unsigned MinABIStackAlignInBytes = 4;
   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
-  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true,
+  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), 0, /*ByVal=*/true,
                                  TypeAlign > MinABIStackAlignInBytes);
 }
 
diff --git a/clang/lib/CodeGen/Targets/ARM.cpp b/clang/lib/CodeGen/Targets/ARM.cpp
index 2d858fa2f3c3a3..d89a0bdff56a35 100644
--- a/clang/lib/CodeGen/Targets/ARM.cpp
+++ b/clang/lib/CodeGen/Targets/ARM.cpp
@@ -397,7 +397,7 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
     // bigger than 128-bits, they get placed in space allocated by the caller,
     // and a pointer is passed.
     return ABIArgInfo::getIndirect(
-        CharUnits::fromQuantity(getContext().getTypeAlign(Ty) / 8), false);
+        CharUnits::fromQuantity(getContext().getTypeAlign(Ty) / 8), 0, false);
   }
 
   // Support byval for ARM.
@@ -415,7 +415,7 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
   }
   if (getContext().getTypeSizeInChars(Ty) > CharUnits::fromQuantity(64)) {
     assert(getABIKind() != ARMABIKind::AAPCS16_VFP && "unexpected byval");
-    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(ABIAlign),
+    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(ABIAlign), 0,
                                    /*ByVal=*/true,
                                    /*Realign=*/TyAlign > ABIAlign);
   }
diff --git a/clang/lib/CodeGen/Targets/Lanai.cpp b/clang/lib/CodeGen/Targets/Lanai.cpp
index 2578fc0291e760..ffacb0ccbea53f 100644
--- a/clang/lib/CodeGen/Targets/Lanai.cpp
+++ b/clang/lib/CodeGen/Targets/Lanai.cpp
@@ -78,7 +78,7 @@ ABIArgInfo LanaiABIInfo::getIndirectResult(QualType Ty, bool ByVal,
   // Compute the byval alignment.
   const unsigned MinABIStackAlignInBytes = 4;
   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
-  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true,
+  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), 0, /*ByVal=*/true,
                                  /*Realign=*/TypeAlign >
                                      MinABIStackAlignInBytes);
 }
diff --git a/clang/lib/CodeGen/Targets/PPC.cpp b/clang/lib/CodeGen/Targets/PPC.cpp
index 989e46f4b66a7d..c8796036b214f5 100644
--- a/clang/lib/CodeGen/Targets/PPC.cpp
+++ b/clang/lib/CodeGen/Targets/PPC.cpp
@@ -213,7 +213,7 @@ ABIArgInfo AIXABIInfo::classifyArgumentType(QualType Ty) const {
     CharUnits CCAlign = getParamTypeAlignment(Ty);
     CharUnits TyAlign = getContext().getTypeAlignInChars(Ty);
 
-    return ABIArgInfo::getIndirect(CCAlign, /*ByVal*/ true,
+    return ABIArgInfo::getIndirect(CCAlign, 0, /*ByVal*/ true,
                                    /*Realign*/ TyAlign > CCAlign);
   }
 
@@ -887,7 +887,7 @@ PPC64_SVR4_ABIInfo::classifyArgumentType(QualType Ty) const {
     }
 
     // All other aggregates are passed ByVal.
-    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(ABIAlign),
+    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(ABIAlign), 0,
                                    /*ByVal=*/true,
                                    /*Realign=*/TyAlign > ABIAlign);
   }
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 7f73bf2a65266e..f097c27bd89478 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -606,12 +606,12 @@ ABIArgInfo X86_32ABIInfo::getIndirectResult(QualType Ty, bool ByVal,
   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
   unsigned StackAlign = getTypeStackAlignInBytes(Ty, TypeAlign);
   if (StackAlign == 0)
-    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true);
+    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), 0, /*ByVal=*/true);
 
   // If the stack alignment is less than the type alignment, realign the
   // argument.
   bool Realign = TypeAlign > StackAlign;
-  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(StackAlign),
+  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(StackAlign), 0,
                                  /*ByVal=*/true, Realign);
 }
 
@@ -2247,7 +2247,7 @@ ABIArgInfo X86_64ABIInfo::getIndirectResult(QualType Ty,
                                                           Size));
   }
 
-  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(Align));
+  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(Align), 0);
 }
 
 /// The ABI specifies that a value should be passed in a full vector XMM/YMM
@@ -3304,7 +3304,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
           return ABIArgInfo::getDirect();
         return ABIArgInfo::getExpand();
       }
-      return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+      return ABIArgInfo::getIndirect(Align, 0, /*ByVal=*/false);
     } else if (IsVectorCall) {
       if (FreeSSERegs >= NumElts &&
           (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())) {
@@ -3314,7 +3314,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
         return ABIArgInfo::getExpand();
       } else if (!Ty->isBuiltinType() && !Ty->isVectorType()) {
         // HVAs are delayed and reclassified in the 2nd step.
-        return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+        return ABIArgInfo::getIndirect(Align, 0, /*ByVal=*/false);
       }
     }
   }
@@ -3350,7 +3350,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
       if (IsMingw64) {
         const llvm::fltSemantics *LDF = &getTarget().getLongDoubleFormat();
         if (LDF == &llvm::APFloat::x87DoubleExtended())
-          return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+          return ABIArgInfo::getIndirect(Align, 0, /*ByVal=*/false);
       }
       break;
 
@@ -3360,7 +3360,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
       // than 8 bytes are passed indirectly. GCC follows it. We follow it too,
       // even though it isn't particularly efficient.
       if (!IsReturnType)
-        return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+        return ABIArgInfo::getIndirect(Align, 0, /*ByVal=*/false);
 
       // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
       // Clang matches them for compatibility.
@@ -3380,7 +3380,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
     // the power of 2.
     if (Width <= 64)
       return ABIArgInfo::getDirect();
-    return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+    return ABIArgInfo::getIndirect(Align, 0, /*ByVal=*/false);
   }
 
   return ABIArgInfo::getDirect();
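
A brief sketch of the updated call pattern for target ABI code after this change
(illustrative fragment assuming the clang::CodeGen types from CGFunctionInfo.h are in
scope; it is not a hunk from the patch): the address space is now the second parameter
of ABIArgInfo::getIndirect, where 0 preserves the historical default-AS behaviour and a
target such as AMDGPU can pass its alloca AS instead, as the AMDGPU.cpp hunk above does
for indirect returns.

// Illustrative only: classify a large aggregate return as indirect, carrying
// the address space the sret pointer should live in.
static ABIArgInfo classifyBigReturn(CharUnits Align, unsigned AllocaAS) {
  return ABIArgInfo::getIndirect(Align, /*AddrSpace=*/AllocaAS,
                                 /*ByVal=*/false);
}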

>From 1c3e67cdebf6025aacd1900c22f033504d8e7963 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 6 Nov 2024 13:21:51 +0200
Subject: [PATCH 7/9] Fix formatting.

---
 clang/lib/CodeGen/Targets/X86.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index f097c27bd89478..6e5b46d5f91c8a 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -606,7 +606,8 @@ ABIArgInfo X86_32ABIInfo::getIndirectResult(QualType Ty, bool ByVal,
   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
   unsigned StackAlign = getTypeStackAlignInBytes(Ty, TypeAlign);
   if (StackAlign == 0)
-    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), 0, /*ByVal=*/true);
+    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), 0,
+                                   /*ByVal=*/true);
 
   // If the stack alignment is less than the type alignment, realign the
   // argument.

>From c9288fc9d38c603ef120714343b2a57611fda424 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 8 Nov 2024 01:13:11 +0200
Subject: [PATCH 8/9] Drop vestigial target hook.

---
 clang/include/clang/Basic/TargetInfo.h | 8 --------
 clang/lib/CodeGen/CGCall.cpp           | 7 ++-----
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index fa5021baf667b5..25eda907d20a7b 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1780,14 +1780,6 @@ class TargetInfo : public TransferrableTargetInfo,
     return 0;
   }
 
-  /// \returns Target specific address space for indirect (e.g. sret) arguments.
-  /// If such an address space exists, it must be convertible to and from the
-  /// alloca address space. If it does not, std::nullopt is returned and the
-  /// alloca address space will be used.
-  virtual std::optional<unsigned> getIndirectArgAddressSpace() const {
-    return std::nullopt;
-  }
-
   /// \returns If a target requires an address within a target specific address
   /// space \p AddressSpace to be converted in order to be used, then return the
   /// corresponding target specific DWARF address space.
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 87e70df795a986..32200ada7cf7de 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1672,11 +1672,8 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
 
   // Add type for sret argument.
   if (IRFunctionArgs.hasSRetArg()) {
-    auto AddressSpace = CGM.getTarget().getIndirectArgAddressSpace();
-    if (!AddressSpace)
-      AddressSpace = getDataLayout().getAllocaAddrSpace();
-    ArgTypes[IRFunctionArgs.getSRetArgNo()] =
-        llvm::PointerType::get(getLLVMContext(), *AddressSpace);
+    ArgTypes[IRFunctionArgs.getSRetArgNo()] = llvm::PointerType::get(
+        getLLVMContext(), FI.getReturnInfo().getIndirectAddrSpace());
   }
 
   // Add type for inalloca argument.

>From 013790c1ad46cbcc143fb30fccc1ead25947da24 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 15 Nov 2024 23:04:53 +0000
Subject: [PATCH 9/9] Tweak handling of potential AS mismatches.

---
 clang/lib/CodeGen/CGCall.cpp                  | 24 ++++++++++---------
 clang/lib/CodeGen/CGExprAgg.cpp               | 19 ++++++++++-----
 .../CodeGenOpenCL/addr-space-struct-arg.cl    | 10 ++++----
 .../amdgpu-abi-struct-arg-byref.cl            | 10 ++++----
 4 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 32200ada7cf7de..41105262b5f14d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5168,7 +5168,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       // alloca was AS cast to the default AS, so we ensure the cast is
       // stripped before binding to the sret arg, which is in the alloca AS.
       IRCallArgs[IRFunctionArgs.getSRetArgNo()] =
-          getAsNaturalPointerTo(SRetPtr, RetTy)->stripPointerCasts();
+          getAsNaturalPointerTo(SRetPtr, RetTy);
     } else if (RetAI.isInAlloca()) {
       Address Addr =
           Builder.CreateStructGEP(ArgMemory, RetAI.getInAllocaFieldIndex());
@@ -5390,18 +5390,20 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
             V->getType()->isIntegerTy())
           V = Builder.CreateZExt(V, ArgInfo.getCoerceToType());
 
-        // If the argument doesn't match, we are either trying to pass an
-        // alloca-ed sret argument directly, and the alloca AS does not match
-        // the default AS, in which case we AS cast it, or we have a trivial
-        // type mismatch, and thus perform a bitcast to coerce it.
+        // The only plausible mismatch here would be for pointer address spaces,
+        // which can happen e.g. when passing a sret arg that is in the AllocaAS
+        // to a function that takes a pointer to an argument in the DefaultAS.
+        // We assume that the target has a reasonable mapping for the DefaultAS
+        // (it can be cast to from incoming specific ASes), and insert an AS
+        // cast to address the mismatch.
         if (FirstIRArg < IRFuncTy->getNumParams() &&
             V->getType() != IRFuncTy->getParamType(FirstIRArg)) {
-          auto IRTy = IRFuncTy->getParamType(FirstIRArg);
-          auto MaybeSRetArg = dyn_cast_or_null<llvm::Argument>(V);
-          if (MaybeSRetArg && MaybeSRetArg->hasStructRetAttr())
-            V = Builder.CreateAddrSpaceCast(V, IRTy);
-          else
-            V = Builder.CreateBitCast(V, IRTy);
+          assert(V->getType()->isPointerTy() && "Only pointers can mismatch!");
+          auto FormalAS =
+            CallInfo.arguments()[ArgNo].type.getQualifiers().getAddressSpace();
+          auto ActualAS = I->Ty.getAddressSpace();
+          V = getTargetHooks().performAddrSpaceCast(
+              *this, V, ActualAS, FormalAS, IRFuncTy->getParamType(FirstIRArg));
         }
 
         if (ArgHasMaybeUndefAttr)
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 2ad6587089f101..f9c9c5df801631 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -296,18 +296,25 @@ void AggExprEmitter::withReturnValueSlot(
                  (RequiresDestruction && Dest.isIgnored());
 
   Address RetAddr = Address::invalid();
-  RawAddress RetAllocaAddr = RawAddress::invalid();
 
   EHScopeStack::stable_iterator LifetimeEndBlock;
   llvm::Value *LifetimeSizePtr = nullptr;
   llvm::IntrinsicInst *LifetimeStartInst = nullptr;
   if (!UseTemp) {
-    RetAddr = Dest.getAddress();
+    // It is possible for the existing slot we are using directly to have been
+    // allocated in the correct AS for an indirect return, and then cast to
+    // the default AS (this is the behaviour of CreateMemTemp); however, we know
+    // that the return address is expected to point to the uncast AS, hence we
+    // strip possible pointer casts here.
+    if (Dest.getAddress().isValid())
+      RetAddr = Dest.getAddress().withPointer(
+          Dest.getAddress().getBasePointer()->stripPointerCasts(),
+          Dest.getAddress().isKnownNonNull());
   } else {
-    RetAddr = CGF.CreateMemTemp(RetTy, "tmp", &RetAllocaAddr);
+    RetAddr = CGF.CreateMemTempWithoutCast(RetTy, "tmp");
     llvm::TypeSize Size =
         CGF.CGM.getDataLayout().getTypeAllocSize(CGF.ConvertTypeForMem(RetTy));
-    LifetimeSizePtr = CGF.EmitLifetimeStart(Size, RetAllocaAddr.getPointer());
+    LifetimeSizePtr = CGF.EmitLifetimeStart(Size, RetAddr.getBasePointer());
     if (LifetimeSizePtr) {
       LifetimeStartInst =
           cast<llvm::IntrinsicInst>(std::prev(Builder.GetInsertPoint()));
@@ -316,7 +323,7 @@ void AggExprEmitter::withReturnValueSlot(
              "Last insertion wasn't a lifetime.start?");
 
       CGF.pushFullExprCleanup<CodeGenFunction::CallLifetimeEnd>(
-          NormalEHLifetimeMarker, RetAllocaAddr, LifetimeSizePtr);
+          NormalEHLifetimeMarker, RetAddr, LifetimeSizePtr);
       LifetimeEndBlock = CGF.EHStack.stable_begin();
     }
   }
@@ -337,7 +344,7 @@ void AggExprEmitter::withReturnValueSlot(
     // Since we're not guaranteed to be in an ExprWithCleanups, clean up
     // eagerly.
     CGF.DeactivateCleanupBlock(LifetimeEndBlock, LifetimeStartInst);
-    CGF.EmitLifetimeEnd(LifetimeSizePtr, RetAllocaAddr.getPointer());
+    CGF.EmitLifetimeEnd(LifetimeSizePtr, RetAddr.getBasePointer());
   }
 }
 
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 4a1db2c3564a57..effdeb9546800d 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -154,7 +154,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // AMDGCN20-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
 // AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
 // AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
 // AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
 // AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
 // AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -164,10 +163,10 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
 // AMDGCN20-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
-// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
 // AMDGCN20-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
-// AMDGCN20-NEXT:    store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false)
+// AMDGCN20-NEXT:    store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_kernel void @ker(
@@ -327,7 +326,6 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN20-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
 // AMDGCN20-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
 // AMDGCN20-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
 // AMDGCN20-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
 // AMDGCN20-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
 // AMDGCN20-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -336,7 +334,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN20-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN20-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
 // AMDGCN20-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
-// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
 // AMDGCN20-NEXT:    ret void
 //
 // SPIR-LABEL: define dso_local spir_kernel void @ker_large(
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index c2b2e00d15e13f..2f8ba99a3e4160 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -70,7 +70,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // AMDGCN-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
 // AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
 // AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
 // AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -80,10 +79,10 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
 // AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
 // AMDGCN-NEXT:    [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
 // AMDGCN-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
-// AMDGCN-NEXT:    store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4
-// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false)
+// AMDGCN-NEXT:    store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
 // AMDGCN-NEXT:    ret void
 //
 kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
@@ -112,7 +111,6 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT:    [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
 // AMDGCN-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
 // AMDGCN-NEXT:    store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
 // AMDGCN-NEXT:    store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -121,7 +119,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT:    call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
 // AMDGCN-NEXT:    call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
-// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
 // AMDGCN-NEXT:    ret void
 //
 kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {
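
Finally, a reduced OpenCL sketch of the end state of the series (hypothetical code in
the spirit of the amdgpu-abi-struct-arg-byref.cl test above): on amdgcn-amd-amdhsa the
sret temporary now stays in addrspace(5), the callee receives it in that AS directly,
and the copy out of it becomes a p1-from-p5 memcpy instead of p1-from-p0. The IR in the
comments is the expected shape, not verbatim output.

// Hypothetical reduced kernel; type and function names mirror the test above.
typedef struct { int c[32][32]; } Mat32X32;
typedef struct { int d[64][64]; } Mat64X64;

Mat64X64 foo_large(Mat32X32 in);

kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {
  out[0] = foo_large(in[1]);
  // Roughly:
  //   call void @foo_large(ptr addrspace(5) {{.*}} sret(%struct.Mat64X64) {{.*}} %tmp,
  //                        ptr addrspace(5) noundef byref(%struct.Mat32X32) {{.*}})
  //   call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) {{.*}}, ptr addrspace(5) {{.*}} %tmp, i64 16384, i1 false)
}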


