[llvm-branch-commits] [clang] release/19.x: Fix codegen of consteval functions returning an empty class, and related issues (#93115) (PR #102070)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Aug 5 14:57:37 PDT 2024
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/102070
Backport 1762e01cca0186f1862db561cfd9019164b8c654
Requested by: @efriedma-quic
>From c0d8a44634041ad0a03a37a1af49e785c3145497 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma at quicinc.com>
Date: Thu, 1 Aug 2024 16:18:20 -0700
Subject: [PATCH] Fix codegen of consteval functions returning an empty class,
and related issues (#93115)
Fix codegen of consteval functions returning an empty class, and related
issues
If a class is empty, don't store it to memory: the store might overwrite
useful data. Similarly, if a class has tail padding that might overlap
other fields, don't store the tail padding to memory.
The problem here turned out a bit more general than I initially thought:
basically all uses of EmitAggregateStore were broken. Call lowering had
a method that did mostly the right thing, though: CreateCoercedStore.
Adapt CreateCoercedStore so it always does the conservatively right
thing, and use it for both calls and ConstantExpr.
Also, along the way, fix the "overlap" bit in AggValueSlot: the bit was
set incorrectly for empty classes in some cases.
Fixes #93040.
(cherry picked from commit 1762e01cca0186f1862db561cfd9019164b8c654)
---
clang/lib/CodeGen/CGCall.cpp | 146 ++++++++----------
clang/lib/CodeGen/CGExprAgg.cpp | 23 +--
clang/lib/CodeGen/CodeGenFunction.h | 7 +-
clang/test/CodeGen/arm-mve-intrinsics/vld24.c | 43 ++++--
clang/test/CodeGen/arm-vfp16-arguments2.cpp | 10 +-
.../amdgpu-kernel-arg-pointer-type.cu | 11 +-
clang/test/CodeGenCUDA/builtins-amdgcn.cu | 125 +++++++--------
.../test/CodeGenCUDA/builtins-spirv-amdgcn.cu | 123 +++++++--------
.../CodeGenCXX/address-space-cast-coerce.cpp | 6 +-
clang/test/CodeGenCXX/cxx2a-consteval.cpp | 24 ++-
clang/test/CodeGenCXX/trivial_abi.cpp | 20 +++
clang/test/CodeGenHIP/dpp-const-fold.hip | 8 +-
.../spirv-amdgcn-dpp-const-fold.hip | 8 +-
.../CodeGenOpenCL/addr-space-struct-arg.cl | 11 +-
.../amdgpu-abi-struct-arg-byref.cl | 42 +++--
15 files changed, 320 insertions(+), 287 deletions(-)
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 234a9c16e39df..70e2f37eb40e1 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1336,75 +1336,50 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
return CGF.Builder.CreateLoad(Tmp);
}
-// Function to store a first-class aggregate into memory. We prefer to
-// store the elements rather than the aggregate to be more friendly to
-// fast-isel.
-// FIXME: Do we need to recurse here?
-void CodeGenFunction::EmitAggregateStore(llvm::Value *Val, Address Dest,
- bool DestIsVolatile) {
- // Prefer scalar stores to first-class aggregate stores.
- if (llvm::StructType *STy = dyn_cast<llvm::StructType>(Val->getType())) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Address EltPtr = Builder.CreateStructGEP(Dest, i);
- llvm::Value *Elt = Builder.CreateExtractValue(Val, i);
- Builder.CreateStore(Elt, EltPtr, DestIsVolatile);
- }
- } else {
- Builder.CreateStore(Val, Dest, DestIsVolatile);
- }
-}
-
-/// CreateCoercedStore - Create a store to \arg DstPtr from \arg Src,
-/// where the source and destination may have different types. The
-/// destination is known to be aligned to \arg DstAlign bytes.
-///
-/// This safely handles the case when the src type is larger than the
-/// destination type; the upper bits of the src will be lost.
-static void CreateCoercedStore(llvm::Value *Src,
- Address Dst,
- bool DstIsVolatile,
- CodeGenFunction &CGF) {
- llvm::Type *SrcTy = Src->getType();
- llvm::Type *DstTy = Dst.getElementType();
- if (SrcTy == DstTy) {
- CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
- return;
- }
-
- llvm::TypeSize SrcSize = CGF.CGM.getDataLayout().getTypeAllocSize(SrcTy);
-
- if (llvm::StructType *DstSTy = dyn_cast<llvm::StructType>(DstTy)) {
- Dst = EnterStructPointerForCoercedAccess(Dst, DstSTy,
- SrcSize.getFixedValue(), CGF);
- DstTy = Dst.getElementType();
- }
-
- llvm::PointerType *SrcPtrTy = llvm::dyn_cast<llvm::PointerType>(SrcTy);
- llvm::PointerType *DstPtrTy = llvm::dyn_cast<llvm::PointerType>(DstTy);
- if (SrcPtrTy && DstPtrTy &&
- SrcPtrTy->getAddressSpace() != DstPtrTy->getAddressSpace()) {
- Src = CGF.Builder.CreateAddrSpaceCast(Src, DstTy);
- CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
+void CodeGenFunction::CreateCoercedStore(llvm::Value *Src, Address Dst,
+ llvm::TypeSize DstSize,
+ bool DstIsVolatile) {
+ if (!DstSize)
return;
- }
- // If the source and destination are integer or pointer types, just do an
- // extension or truncation to the desired type.
- if ((isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy)) &&
- (isa<llvm::IntegerType>(DstTy) || isa<llvm::PointerType>(DstTy))) {
- Src = CoerceIntOrPtrToIntOrPtr(Src, DstTy, CGF);
- CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
- return;
+ llvm::Type *SrcTy = Src->getType();
+ llvm::TypeSize SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
+
+ // GEP into structs to try to make types match.
+ // FIXME: This isn't really that useful with opaque types, but it impacts a
+ // lot of regression tests.
+ if (SrcTy != Dst.getElementType()) {
+ if (llvm::StructType *DstSTy =
+ dyn_cast<llvm::StructType>(Dst.getElementType())) {
+ assert(!SrcSize.isScalable());
+ Dst = EnterStructPointerForCoercedAccess(Dst, DstSTy,
+ SrcSize.getFixedValue(), *this);
+ }
}
- llvm::TypeSize DstSize = CGF.CGM.getDataLayout().getTypeAllocSize(DstTy);
-
- // If store is legal, just bitcast the src pointer.
- if (isa<llvm::ScalableVectorType>(SrcTy) ||
- isa<llvm::ScalableVectorType>(DstTy) ||
- SrcSize.getFixedValue() <= DstSize.getFixedValue()) {
- Dst = Dst.withElementType(SrcTy);
- CGF.EmitAggregateStore(Src, Dst, DstIsVolatile);
+ if (SrcSize.isScalable() || SrcSize <= DstSize) {
+ if (SrcTy->isIntegerTy() && Dst.getElementType()->isPointerTy() &&
+ SrcSize == CGM.getDataLayout().getTypeAllocSize(Dst.getElementType())) {
+ // If the value is supposed to be a pointer, convert it before storing it.
+ Src = CoerceIntOrPtrToIntOrPtr(Src, Dst.getElementType(), *this);
+ Builder.CreateStore(Src, Dst, DstIsVolatile);
+ } else if (llvm::StructType *STy =
+ dyn_cast<llvm::StructType>(Src->getType())) {
+ // Prefer scalar stores to first-class aggregate stores.
+ Dst = Dst.withElementType(SrcTy);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Address EltPtr = Builder.CreateStructGEP(Dst, i);
+ llvm::Value *Elt = Builder.CreateExtractValue(Src, i);
+ Builder.CreateStore(Elt, EltPtr, DstIsVolatile);
+ }
+ } else {
+ Builder.CreateStore(Src, Dst.withElementType(SrcTy), DstIsVolatile);
+ }
+ } else if (SrcTy->isIntegerTy()) {
+ // If the source is a simple integer, coerce it directly.
+ llvm::Type *DstIntTy = Builder.getIntNTy(DstSize.getFixedValue() * 8);
+ Src = CoerceIntOrPtrToIntOrPtr(Src, DstIntTy, *this);
+ Builder.CreateStore(Src, Dst.withElementType(DstIntTy), DstIsVolatile);
} else {
// Otherwise do coercion through memory. This is stupid, but
// simple.
@@ -1416,12 +1391,12 @@ static void CreateCoercedStore(llvm::Value *Src,
// FIXME: Assert that we aren't truncating non-padding bits when have access
// to that information.
RawAddress Tmp =
- CreateTempAllocaForCoercion(CGF, SrcTy, Dst.getAlignment());
- CGF.Builder.CreateStore(Src, Tmp);
- CGF.Builder.CreateMemCpy(
- Dst.emitRawPointer(CGF), Dst.getAlignment().getAsAlign(),
- Tmp.getPointer(), Tmp.getAlignment().getAsAlign(),
- llvm::ConstantInt::get(CGF.IntPtrTy, DstSize.getFixedValue()));
+ CreateTempAllocaForCoercion(*this, SrcTy, Dst.getAlignment());
+ Builder.CreateStore(Src, Tmp);
+ Builder.CreateMemCpy(Dst.emitRawPointer(*this),
+ Dst.getAlignment().getAsAlign(), Tmp.getPointer(),
+ Tmp.getAlignment().getAsAlign(),
+ Builder.CreateTypeSize(IntPtrTy, DstSize));
}
}
@@ -3309,7 +3284,12 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
assert(NumIRArgs == 1);
auto AI = Fn->getArg(FirstIRArg);
AI->setName(Arg->getName() + ".coerce");
- CreateCoercedStore(AI, Ptr, /*DstIsVolatile=*/false, *this);
+ CreateCoercedStore(
+ AI, Ptr,
+ llvm::TypeSize::getFixed(
+ getContext().getTypeSizeInChars(Ty).getQuantity() -
+ ArgI.getDirectOffset()),
+ /*DstIsVolatile=*/false);
}
// Match to what EmitParmDecl is expecting for this type.
@@ -5939,17 +5919,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
llvm::Value *Imag = Builder.CreateExtractValue(CI, 1);
return RValue::getComplex(std::make_pair(Real, Imag));
}
- case TEK_Aggregate: {
- Address DestPtr = ReturnValue.getAddress();
- bool DestIsVolatile = ReturnValue.isVolatile();
-
- if (!DestPtr.isValid()) {
- DestPtr = CreateMemTemp(RetTy, "agg.tmp");
- DestIsVolatile = false;
- }
- EmitAggregateStore(CI, DestPtr, DestIsVolatile);
- return RValue::getAggregate(DestPtr);
- }
+ case TEK_Aggregate:
+ break;
case TEK_Scalar: {
// If the argument doesn't match, perform a bitcast to coerce it.
// This can happen due to trivial type mismatches.
@@ -5959,7 +5930,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
return RValue::get(V);
}
}
- llvm_unreachable("bad evaluation kind");
}
// If coercing a fixed vector from a scalable vector for ABI
@@ -5981,10 +5951,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
Address DestPtr = ReturnValue.getValue();
bool DestIsVolatile = ReturnValue.isVolatile();
+ uint64_t DestSize =
+ getContext().getTypeInfoDataSizeInChars(RetTy).Width.getQuantity();
if (!DestPtr.isValid()) {
DestPtr = CreateMemTemp(RetTy, "coerce");
DestIsVolatile = false;
+ DestSize = getContext().getTypeSizeInChars(RetTy).getQuantity();
}
// An empty record can overlap other data (if declared with
@@ -5993,7 +5966,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
if (!isEmptyRecord(getContext(), RetTy, true)) {
// If the value is offset in memory, apply the offset now.
Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI);
- CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this);
+ CreateCoercedStore(
+ CI, StorePtr,
+ llvm::TypeSize::getFixed(DestSize - RetAI.getDirectOffset()),
+ DestIsVolatile);
}
return convertTempToRValue(DestPtr, RetTy, SourceLocation());
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index c3c10e73ff05e..d9f44f4be617e 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -131,15 +131,12 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
EnsureDest(E->getType());
if (llvm::Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) {
- Address StoreDest = Dest.getAddress();
- // The emitted value is guaranteed to have the same size as the
- // destination but can have a different type. Just do a bitcast in this
- // case to avoid incorrect GEPs.
- if (Result->getType() != StoreDest.getType())
- StoreDest = StoreDest.withElementType(Result->getType());
-
- CGF.EmitAggregateStore(Result, StoreDest,
- E->getType().isVolatileQualified());
+ CGF.CreateCoercedStore(
+ Result, Dest.getAddress(),
+ llvm::TypeSize::getFixed(
+ Dest.getPreferredSize(CGF.getContext(), E->getType())
+ .getQuantity()),
+ E->getType().isVolatileQualified());
return;
}
return Visit(E->getSubExpr());
@@ -2050,6 +2047,10 @@ CodeGenFunction::getOverlapForFieldInit(const FieldDecl *FD) {
if (!FD->hasAttr<NoUniqueAddressAttr>() || !FD->getType()->isRecordType())
return AggValueSlot::DoesNotOverlap;
+ // Empty fields can overlap earlier fields.
+ if (FD->getType()->getAsCXXRecordDecl()->isEmpty())
+ return AggValueSlot::MayOverlap;
+
// If the field lies entirely within the enclosing class's nvsize, its tail
// padding cannot overlap any already-initialized object. (The only subobjects
// with greater addresses that might already be initialized are vbases.)
@@ -2072,6 +2073,10 @@ AggValueSlot::Overlap_t CodeGenFunction::getOverlapForBaseInit(
if (IsVirtual)
return AggValueSlot::MayOverlap;
+ // Empty bases can overlap earlier bases.
+ if (BaseRD->isEmpty())
+ return AggValueSlot::MayOverlap;
+
// If the base class is laid out entirely within the nvsize of the derived
// class, its tail padding cannot yet be initialized, so we can issue
// stores at the full width of the base class.
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ba7b565d97559..60e6841e1b3d6 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4838,9 +4838,10 @@ class CodeGenFunction : public CodeGenTypeCache {
void EmitAggFinalDestCopy(QualType Type, AggValueSlot Dest, const LValue &Src,
ExprValueKind SrcKind);
- /// Build all the stores needed to initialize an aggregate at Dest with the
- /// value Val.
- void EmitAggregateStore(llvm::Value *Val, Address Dest, bool DestIsVolatile);
+ /// Create a store to \arg DstPtr from \arg Src, truncating the stored value
+ /// to at most \arg DstSize bytes.
+ void CreateCoercedStore(llvm::Value *Src, Address Dst, llvm::TypeSize DstSize,
+ bool DstIsVolatile);
/// EmitExtendGCLifetime - Given a pointer to an Objective-C object,
/// make sure it survives garbage collection until this point.
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
index 03c870e281549..15619bef5373d 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
@@ -48,10 +48,13 @@ uint8x16x4_t test_vld4q_u8(const uint8_t *addr)
// CHECK-LABEL: @test_vst2q_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[VALUE_COERCE]], 0, 1
-// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR:%.*]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0)
-// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] [[VALUE_COERCE:%.*]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR:%.*]], <4 x i32> [[DOTFCA_0_EXTRACT]], <4 x i32> [[DOTFCA_1_EXTRACT]], i32 0)
+// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v4i32(ptr [[ADDR]], <4 x i32> [[DOTFCA_0_EXTRACT]], <4 x i32> [[DOTFCA_1_EXTRACT]], i32 1)
// CHECK-NEXT: ret void
//
void test_vst2q_u32(uint32_t *addr, uint32x4x2_t value)
@@ -65,14 +68,19 @@ void test_vst2q_u32(uint32_t *addr, uint32x4x2_t value)
// CHECK-LABEL: @test_vst4q_s8(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 1
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 2
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] [[VALUE_COERCE]], 0, 3
-// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR:%.*]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 0)
-// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 1)
-// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 2)
-// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] [[VALUE_COERCE:%.*]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 2
+// CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[TMP0]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[DOTFCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[DOTFCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR:%.*]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 0)
+// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 1)
+// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.mve.vst4q.p0.v16i8(ptr [[ADDR]], <16 x i8> [[DOTFCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_EXTRACT]], i32 3)
// CHECK-NEXT: ret void
//
void test_vst4q_s8(int8_t *addr, int8x16x4_t value)
@@ -86,10 +94,13 @@ void test_vst4q_s8(int8_t *addr, int8x16x4_t value)
// CHECK-LABEL: @test_vst2q_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] [[VALUE_COERCE:%.*]], 0, 0
-// CHECK-NEXT: [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[VALUE_COERCE]], 0, 1
-// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR:%.*]], <8 x half> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0)
-// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR]], <8 x half> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <8 x half> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T:%.*]] [[VALUE_COERCE:%.*]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, <8 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR:%.*]], <8 x half> [[DOTFCA_0_EXTRACT]], <8 x half> [[DOTFCA_1_EXTRACT]], i32 0)
+// CHECK-NEXT: call void @llvm.arm.mve.vst2q.p0.v8f16(ptr [[ADDR]], <8 x half> [[DOTFCA_0_EXTRACT]], <8 x half> [[DOTFCA_1_EXTRACT]], i32 1)
// CHECK-NEXT: ret void
//
void test_vst2q_f16(float16_t *addr, float16x8x2_t value)
diff --git a/clang/test/CodeGen/arm-vfp16-arguments2.cpp b/clang/test/CodeGen/arm-vfp16-arguments2.cpp
index 6221e85e856b4..b7c6852c47b7f 100644
--- a/clang/test/CodeGen/arm-vfp16-arguments2.cpp
+++ b/clang/test/CodeGen/arm-vfp16-arguments2.cpp
@@ -44,20 +44,20 @@ struct S1 f1(struct S1 s1) { return s1; }
// CHECK-SOFT: define{{.*}} void @_Z2f22S2(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S2) align 8 %agg.result, [4 x i32] %s2.coerce)
// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f22S2([2 x <2 x i32>] returned %s2.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S2 @_Z2f22S2(%struct.S2 returned %s2.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S2 @_Z2f22S2(%struct.S2 %s2.coerce)
struct S2 f2(struct S2 s2) { return s2; }
// CHECK-SOFT: define{{.*}} void @_Z2f32S3(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S3) align 8 %agg.result, [2 x i64] %s3.coerce)
// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f32S3([2 x <2 x i32>] returned %s3.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S3 @_Z2f32S3(%struct.S3 returned %s3.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S3 @_Z2f32S3(%struct.S3 %s3.coerce)
struct S3 f3(struct S3 s3) { return s3; }
// CHECK-SOFT: define{{.*}} void @_Z2f42S4(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S4) align 8 %agg.result, [2 x i64] %s4.coerce)
// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f42S4([2 x <2 x i32>] returned %s4.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S4 @_Z2f42S4(%struct.S4 returned %s4.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S4 @_Z2f42S4(%struct.S4 %s4.coerce)
struct S4 f4(struct S4 s4) { return s4; }
// CHECK-SOFT: define{{.*}} void @_Z2f52S5(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.S5) align 8 %agg.result, [2 x i64] %s5.coerce)
-// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce)
-// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce)
+// CHECK-HARD: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 %s5.coerce)
+// CHECK-FULL: define{{.*}} arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 %s5.coerce)
struct S5 f5(struct S5 s5) { return s5; }
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index a5135ab01f0f3..70c86cbb8c3d4 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -16,9 +16,8 @@
// HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(ptr noundef %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr addrspace(1){{.*}} %x.coerce)
-// CHECK: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
-// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD:[0-9]+]]
+// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}}
// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
// OPT: ret void
@@ -28,9 +27,8 @@ __global__ void kernel1(int *x) {
// HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(ptr noundef nonnull align 4 dereferenceable(4) %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(ptr addrspace(1){{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
-// CHECK: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
-// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4, !amdgpu.noclobber ![[MD]]
+// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}}
// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4
// OPT: ret void
@@ -67,7 +65,7 @@ struct S {
// OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8
// OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8
// OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1)
-// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]]
+// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD:[0-9]+]]
// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
// OPT: store i32 [[INC]], ptr addrspace(1) [[G0]], align 4
// OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4
@@ -126,9 +124,8 @@ struct SS {
};
// HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(ptr %a.coerce)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(ptr addrspace(1){{.*}} %a.coerce)
-// CHECK: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr
-// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4, !amdgpu.noclobber ![[MD]]
+// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4{{$}}
// OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
// OPT: store float [[INC]], ptr addrspace(1) %a.coerce, align 4
// OPT: ret void
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 2e88afac813f4..4bf23e529c7a5 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -17,17 +17,16 @@
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DISPATCH_PTR]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT: store ptr [[TMP2]], ptr [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__global__ void use_dispatch_ptr(int* out) {
@@ -43,17 +42,16 @@ __global__ void use_dispatch_ptr(int* out) {
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[QUEUE_PTR]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT: store ptr [[TMP2]], ptr [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__global__ void use_queue_ptr(int* out) {
@@ -69,17 +67,16 @@ __global__ void use_queue_ptr(int* out) {
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMPLICITARG_PTR]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT: store ptr [[TMP2]], ptr [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__global__ void use_implicitarg_ptr(int* out) {
@@ -134,16 +131,15 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8
// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT: store volatile float [[TMP3]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin(float src, float *shared) {
@@ -184,17 +180,16 @@ __global__ void endpgm() {
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35)
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[B_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35)
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[TMP3]], align 8
// CHECK-NEXT: ret void
//
__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
@@ -210,13 +205,12 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[TMP1]], align 8
// CHECK-NEXT: ret void
//
__global__ void test_s_memtime(unsigned long long* out)
@@ -237,18 +231,17 @@ __device__ void func(float *x);
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8
// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT: store volatile float [[TMP3]], ptr [[X_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP4]]) #[[ATTR7:[0-9]+]]
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
@@ -264,14 +257,13 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8
// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP1]])
-// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP2]] to i8
-// CHECK-NEXT: store i8 [[FROMBOOL]], ptr [[RET_ASCAST]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP0]])
+// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP1]] to i8
+// CHECK-NEXT: store i8 [[STOREDV]], ptr [[RET_ASCAST]], align 1
// CHECK-NEXT: ret void
//
__global__ void test_is_shared(float *x){
@@ -286,14 +278,13 @@ __global__ void test_is_shared(float *x){
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr
-// CHECK-NEXT: store ptr [[TMP0]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8
// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP1]])
-// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP2]] to i8
-// CHECK-NEXT: store i8 [[FROMBOOL]], ptr [[RET_ASCAST]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP0]])
+// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP1]] to i8
+// CHECK-NEXT: store i8 [[STOREDV]], ptr [[RET_ASCAST]], align 1
// CHECK-NEXT: ret void
//
__global__ void test_is_private(int *x){
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 32851805298f1..1cbe358910b85 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -17,16 +17,15 @@
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
// CHECK-NEXT: ret void
//
__global__ void use_dispatch_ptr(int* out) {
@@ -42,16 +41,15 @@ __global__ void use_dispatch_ptr(int* out) {
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
// CHECK-NEXT: ret void
//
__global__ void use_queue_ptr(int* out) {
@@ -67,16 +65,15 @@ __global__ void use_queue_ptr(int* out) {
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
// CHECK-NEXT: ret void
//
__global__ void use_implicitarg_ptr(int* out) {
@@ -131,16 +128,15 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT: store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin(float src, float *shared) {
@@ -175,17 +171,16 @@ __global__ void endpgm() {
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35)
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[TMP4]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35)
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[TMP2]], ptr addrspace(4) [[TMP3]], align 8
// CHECK-NEXT: ret void
//
__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
@@ -201,13 +196,12 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i64 [[TMP1]], ptr addrspace(4) [[TMP2]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[TMP0]], ptr addrspace(4) [[TMP1]], align 8
// CHECK-NEXT: ret void
//
__global__ void test_s_memtime(unsigned long long* out)
@@ -228,18 +222,17 @@ __device__ void func(float *x);
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
-// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// CHECK-NEXT: store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR6:[0-9]+]]
// CHECK-NEXT: ret void
//
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
@@ -255,15 +248,14 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP2]])
-// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
-// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT: [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP1]])
+// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
// CHECK-NEXT: ret void
//
__global__ void test_is_shared(float *x){
@@ -278,15 +270,14 @@ __global__ void test_is_shared(float *x){
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
-// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP2]])
-// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
-// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// CHECK-NEXT: [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP1]])
+// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
// CHECK-NEXT: ret void
//
__global__ void test_is_private(int *x){
diff --git a/clang/test/CodeGenCXX/address-space-cast-coerce.cpp b/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
index 7279b6c7f23a0..1ad46042b6efd 100644
--- a/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
+++ b/clang/test/CodeGenCXX/address-space-cast-coerce.cpp
@@ -46,9 +46,9 @@ int mane() {
char1 f1{1};
char1 f2{1};
-// CHECK: [[TMP:%.+]] = alloca i16
-// CHECK: [[COERCE:%.+]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %{{.+}}, ptr align 2 [[COERCE]], i64 1, i1 false)
+// CHECK: [[CALL:%.*]] = call i16
+// CHECK: [[TRUNC:%.*]] = trunc i16 [[CALL]] to i8
+// CHECK: store i8 [[TRUNC]]
char1 f3 = f1 + f2;
}
diff --git a/clang/test/CodeGenCXX/cxx2a-consteval.cpp b/clang/test/CodeGenCXX/cxx2a-consteval.cpp
index 075cab58358ab..6c09053a74d24 100644
--- a/clang/test/CodeGenCXX/cxx2a-consteval.cpp
+++ b/clang/test/CodeGenCXX/cxx2a-consteval.cpp
@@ -1,4 +1,3 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -emit-llvm %s -std=c++2a -triple x86_64-unknown-linux-gnu -o %t.ll
// RUN: FileCheck -check-prefix=EVAL -input-file=%t.ll %s
// RUN: FileCheck -check-prefix=EVAL-STATIC -input-file=%t.ll %s
@@ -275,3 +274,26 @@ void f() {
// EVAL-FN: call void @_ZN7GH821542S3C2Ei
}
}
+
+namespace GH93040 {
+struct C { char c = 1; };
+struct Empty { consteval Empty() {} };
+struct Empty2 { consteval Empty2() {} };
+struct Test : C, Empty {
+ [[no_unique_address]] Empty2 e;
+};
+static_assert(sizeof(Test) == 1);
+void f() {
+ Test test;
+
+// Make sure we don't overwrite the initialization of c.
+
+// EVAL-FN-LABEL: define {{.*}} void @_ZN7GH930404TestC2Ev
+// EVAL-FN: entry:
+// EVAL-FN-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// EVAL-FN-NEXT: store ptr {{.*}}, ptr [[THIS_ADDR]], align 8
+// EVAL-FN-NEXT: [[THIS:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// EVAL-FN-NEXT: call void @_ZN7GH930401CC2Ev(ptr noundef nonnull align 1 dereferenceable(1) [[THIS]])
+// EVAL-FN-NEXT: ret void
+}
+}
diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp b/clang/test/CodeGenCXX/trivial_abi.cpp
index 3012b0f2bc33d..54912a617c287 100644
--- a/clang/test/CodeGenCXX/trivial_abi.cpp
+++ b/clang/test/CodeGenCXX/trivial_abi.cpp
@@ -262,6 +262,26 @@ void testExceptionLarge() {
calleeExceptionLarge(Large(), Large());
}
+// CHECK: define void @_ZN7GH930401gEPNS_1SE
+// CHECK: [[CALL:%.*]] = call i64 @_ZN7GH930401fEv
+// CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[CALL]] to i56
+// CHECK-NEXT: store i56 [[TRUNC]]
+// CHECK-NEXT: ret void
+void* operator new(unsigned long, void*);
+namespace GH93040 {
+struct [[clang::trivial_abi]] S {
+ char a;
+ int x;
+ __attribute((aligned(2))) char y;
+ S();
+} __attribute((packed));
+S f();
+void g(S* s) { new(s) S(f()); }
+struct S2 { [[no_unique_address]] S s; char c;};
+static_assert(sizeof(S) == 8 && sizeof(S2) == 8, "");
+}
+
+
// PR42961
// CHECK: define{{.*}} @"_ZN3$_08__invokeEv"()
diff --git a/clang/test/CodeGenHIP/dpp-const-fold.hip b/clang/test/CodeGenHIP/dpp-const-fold.hip
index f5a97c6b0e77f..c5450ec4b8413 100644
--- a/clang/test/CodeGenHIP/dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/dpp-const-fold.hip
@@ -22,25 +22,25 @@ constexpr static bool BountCtrl()
return true & false;
}
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
}
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
}
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
}
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
index 2b785200e8eea..71270bc1c68d8 100644
--- a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@@ -21,25 +21,25 @@ constexpr static bool BountCtrl()
return true & false;
}
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
}
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
}
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
}
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
__attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
{
*out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 385f8a753cd8e..1651cb379a20f 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -145,7 +145,9 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: store %struct.LargeStructOneMember %u.coerce, ptr addrspace(5) %[[U]], align 8
+// AMDGCN: %[[U_ELEM:.*]] = getelementptr inbounds %struct.LargeStructOneMember, ptr addrspace(5) %[[U]], i32 0, i32 0
+// AMDGCN: %[[EXTRACT:.*]] = extractvalue %struct.LargeStructOneMember %u.coerce, 0
+// AMDGCN: store [100 x <2 x i32>] %[[EXTRACT]], ptr addrspace(5) %[[U_ELEM]], align 8
// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
FuncOneLargeMember(u);
@@ -177,7 +179,12 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeTwoMember
// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]])
// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], ptr addrspace(5) %[[u]]
+// AMDGCN: %[[U_PTR0:.*]] = getelementptr inbounds %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 0
+// AMDGCN: %[[EXTRACT0:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 0
+// AMDGCN: store [40 x <2 x i32>] %[[EXTRACT0]], ptr addrspace(5) %[[U_PTR0]]
+// AMDGCN: %[[U_PTR1:.*]] = getelementptr inbounds %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 1
+// AMDGCN: %[[EXTRACT1:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 1
+// AMDGCN: store [20 x <2 x i32>] %[[EXTRACT1]], ptr addrspace(5) %[[U_PTR1]]
// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]])
kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
FuncLargeTwoMember(u);
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index fa83a38a01b0a..fe0a2f9578db0 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -61,7 +61,7 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
// the return value.
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -74,7 +74,7 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
-// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @foo([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
+// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
@@ -98,7 +98,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
}
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !8 !kernel_arg_base_type !8 !kernel_arg_type_qual !7 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -168,7 +168,7 @@ void test_indirect_arg_globl(void) {
#endif
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local
-// AMDGCN-SAME: () #[[ATTR1]] !kernel_arg_addr_space !9 !kernel_arg_access_qual !9 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !9 {
+// AMDGCN-SAME: () #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
@@ -193,7 +193,7 @@ void test_indirect_arg_private(void) {
}
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember
-// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: (<2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
@@ -208,7 +208,7 @@ kernel void KernelOneMember(struct StructOneMember u) {
}
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir
-// AMDGCN-SAME: (ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !14 !kernel_arg_access_qual !11 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: (ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
@@ -223,10 +223,12 @@ kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
}
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember
-// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT: store [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
// AMDGCN-NEXT: ret void
//
@@ -271,15 +273,20 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) {
}
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember
-// AMDGCN-SAME: ([[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT: store [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR3]]
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
// AMDGCN-NEXT: ret void
//
kernel void KernelTwoMember(struct StructTwoMember u) {
@@ -287,10 +294,15 @@ kernel void KernelTwoMember(struct StructTwoMember u) {
}
// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember
-// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space !10 !kernel_arg_access_qual !11 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !13 {
+// AMDGCN-SAME: ([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
// AMDGCN-NEXT: entry:
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT: store [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
// AMDGCN-NEXT: ret void
//
More information about the llvm-branch-commits
mailing list