[llvm-branch-commits] [clang] e068f99 - Revert "[clang] remove lots of "innocuous" addrspacecasts (#197745)"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue May 26 06:59:49 PDT 2026
Author: Jan Patrick Lehr
Date: 2026-05-26T15:59:42+02:00
New Revision: e068f993899e94355b5ca734de052319bd927f49
URL: https://github.com/llvm/llvm-project/commit/e068f993899e94355b5ca734de052319bd927f49
DIFF: https://github.com/llvm/llvm-project/commit/e068f993899e94355b5ca734de052319bd927f49.diff
LOG: Revert "[clang] remove lots of "innocuous" addrspacecasts (#197745)"
This reverts commit 2825dfa027e62693753593a8e80511ea88fea6c1.
Added:
Modified:
clang/lib/CodeGen/ABIInfoImpl.cpp
clang/lib/CodeGen/CGAtomic.cpp
clang/lib/CodeGen/CGBlocks.cpp
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/CodeGen/CGCUDANV.cpp
clang/lib/CodeGen/CGCall.cpp
clang/lib/CodeGen/CGClass.cpp
clang/lib/CodeGen/CGDecl.cpp
clang/lib/CodeGen/CGException.cpp
clang/lib/CodeGen/CGExpr.cpp
clang/lib/CodeGen/CGExprAgg.cpp
clang/lib/CodeGen/CGHLSLRuntime.cpp
clang/lib/CodeGen/CGOpenMPRuntime.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/CodeGen/CGStmt.cpp
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/lib/CodeGen/CodeGenFunction.h
clang/lib/CodeGen/MicrosoftCXXABI.cpp
clang/lib/CodeGen/TargetBuiltins/ARM.cpp
clang/lib/CodeGen/TargetBuiltins/X86.cpp
clang/lib/CodeGen/Targets/X86.cpp
clang/test/CodeGen/scoped-atomic-ops.c
clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
clang/test/CodeGenCUDA/atomic-options.hip
clang/test/CodeGenCUDA/builtins-amdgcn.cu
clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
clang/test/CodeGenCUDA/record-layout.cu
clang/test/CodeGenCXX/amdgcn-func-arg.cpp
clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
clang/test/CodeGenHIP/placement-new-addrspace.hip
clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
clang/test/CodeGenOpenCL/atomic-ops.cl
clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp
clang/test/OpenMP/amdgcn_target_device_vla.cpp
clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c
clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp
clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp
index 887a645a4783a..8ad77ee515d49 100644
--- a/clang/lib/CodeGen/ABIInfoImpl.cpp
+++ b/clang/lib/CodeGen/ABIInfoImpl.cpp
@@ -442,7 +442,7 @@ Address CodeGen::EmitVAArgInstr(CodeGenFunction &CGF, Address VAListAddr,
assert(!AI.getCoerceToType() &&
"Unexpected CoerceToType seen in arginfo in generic VAArg emitter!");
- Address Temp = CGF.CreateMemTempWithoutCast(Ty, "varet");
+ Address Temp = CGF.CreateMemTemp(Ty, "varet");
Val = CGF.Builder.CreateVAArg(VAListAddr.emitRawPointer(CGF),
CGF.ConvertTypeForMem(Ty));
CGF.Builder.CreateStore(Val, Temp);
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index 270965b109943..b4fd0fdb795aa 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -304,7 +304,7 @@ Address AtomicInfo::CreateTempAlloca() const {
? ValueTy
: AtomicTy.getUnqualifiedType();
Address TempAlloca =
- CGF.CreateMemTempWithoutCast(TmpTy, getAtomicAlignment(), "atomic-temp");
+ CGF.CreateMemTemp(TmpTy, getAtomicAlignment(), "atomic-temp");
// Cast to pointer to value type for bitfields.
if (LVal.isBitField())
return CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
@@ -826,7 +826,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest,
// into a temporary alloca.
static Address
EmitValToTemp(CodeGenFunction &CGF, Expr *E) {
- Address DeclPtr = CGF.CreateMemTempWithoutCast(E->getType(), ".atomictmp");
+ Address DeclPtr = CGF.CreateMemTemp(E->getType(), ".atomictmp");
CGF.EmitAnyExprToMem(E, DeclPtr, E->getType().getQualifiers(),
/*Init*/ true);
return DeclPtr;
@@ -1025,7 +1025,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
CharUnits PointeeIncAmt =
getContext().getTypeSizeInChars(MemTy->getPointeeType());
Val1Scalar = Builder.CreateMul(Val1Scalar, CGM.getSize(PointeeIncAmt));
- auto Temp = CreateMemTempWithoutCast(Val1Ty, ".atomictmp");
+ auto Temp = CreateMemTemp(Val1Ty, ".atomictmp");
Val1 = Temp;
EmitStoreOfScalar(Val1Scalar, MakeAddrLValue(Temp, Val1Ty));
break;
@@ -1121,7 +1121,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
if (ShouldCastToIntPtrTy)
Dest = Atomics.castToAtomicIntPointer(Dest);
} else if (E->isCmpXChg())
- Dest = CreateMemTempWithoutCast(RValTy, "cmpxchg.bool");
+ Dest = CreateMemTemp(RValTy, "cmpxchg.bool");
else if (!RValTy->isVoidType()) {
Dest = Atomics.CreateTempAlloca();
if (ShouldCastToIntPtrTy)
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index 0683a4937cf37..1ce22df11e6a7 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1418,8 +1418,7 @@ void CodeGenFunction::setBlockContextParameter(const ImplicitParamDecl *D,
// Allocate a stack slot like for any local variable to guarantee optimal
// debug info at -O0. The mem2reg pass will eliminate it when optimizing.
- RawAddress alloc =
- CreateMemTempWithoutCast(D->getType(), D->getName() + ".addr");
+ RawAddress alloc = CreateMemTemp(D->getType(), D->getName() + ".addr");
Builder.CreateStore(arg, alloc);
if (CGDebugInfo *DI = getDebugInfo()) {
if (CGM.getCodeGenOpts().hasReducedDebugInfo()) {
@@ -1558,8 +1557,8 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction(
if (!capture.isConstant()) continue;
CharUnits align = getContext().getDeclAlign(variable);
- Address alloca = CreateMemTempWithoutCast(variable->getType(), align,
- "block.captured-const");
+ Address alloca =
+ CreateMemTemp(variable->getType(), align, "block.captured-const");
Builder.CreateStore(capture.getConstant(), alloca);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b1d727cb5e0ad..4a59c6560ef26 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2271,9 +2271,10 @@ RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) {
if (!isa<Constant>(ArgVal)) {
CleanupKind Cleanup = getARCCleanupKind();
QualType Ty = TheExpr->getType();
- RawAddress Alloca = CreateMemTempWithoutCast(Ty, "os.log.arg");
+ RawAddress Alloca = RawAddress::invalid();
+ RawAddress Addr = CreateMemTemp(Ty, "os.log.arg", &Alloca);
ArgVal = EmitARCRetain(Ty, ArgVal);
- Builder.CreateStore(ArgVal, Alloca);
+ Builder.CreateStore(ArgVal, Addr);
pushLifetimeExtendedDestroy(Cleanup, Alloca, Ty,
CodeGenFunction::destroyARCStrongPrecise,
Cleanup & EHCleanup);
@@ -6345,8 +6346,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
getContext().getSizeType(), ArraySize, nullptr,
ArraySizeModifier::Normal,
/*IndexTypeQuals=*/0);
- auto Tmp = CreateMemTempWithoutCast(SizeArrayTy, "block_sizes");
- llvm::Value *Alloca = Tmp.getPointer();
+ auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
+ llvm::Value *TmpPtr = Tmp.getPointer();
+ // The EmitLifetime* pair expect a naked Alloca as their last argument,
+ // however for cases where the default AS is not the Alloca AS, Tmp is
+ // actually the Alloca ascasted to the default AS, hence the
+ // stripPointerCasts()
+ llvm::Value *Alloca = TmpPtr->stripPointerCasts();
llvm::Value *ElemPtr;
EmitLifetimeStart(Alloca);
// Each of the following arguments specifies the size of the corresponding
@@ -6363,6 +6369,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
Builder.CreateAlignedStore(
V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy));
}
+ // Return the Alloca itself rather than a potential ascast as this is only
+ // used by the paired EmitLifetimeEnd.
return {ElemPtr, Alloca};
};
@@ -6793,7 +6801,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
// always just emit into it.
TypeEvaluationKind EvalKind = getEvaluationKind(E->getType());
if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) {
- Address DestPtr = CreateMemTempWithoutCast(E->getType(), "agg.tmp");
+ Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp");
ReturnValue = ReturnValueSlot(DestPtr, false);
}
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 259b6c040706b..3eda4237b0549 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -433,10 +433,10 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
// Create temporary dim3 grid_dim, block_dim.
ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
QualType Dim3Ty = GridDimParam->getType();
- Address GridDim = CGF.CreateMemTempWithoutCast(
- Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
- Address BlockDim = CGF.CreateMemTempWithoutCast(
- Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
+ Address GridDim =
+ CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
+ Address BlockDim =
+ CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
Address ShmemSize = CGF.CreateTempAlloca(SizeTy, LangAS::Default,
CGM.getSizeAlign(), "shmem_size");
Address Stream = CGF.CreateTempAlloca(PtrTy, LangAS::Default,
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 40cc275d40273..2468394929360 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3373,7 +3373,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
// may be aliased, copy it to ensure that the parameter variable is
// mutable and has a unique adress, as C requires.
if (ArgI.getIndirectRealign() || ArgI.isIndirectAliased()) {
- RawAddress AlignedTemp = CreateMemTempWithoutCast(Ty, "coerce");
+ RawAddress AlignedTemp = CreateMemTemp(Ty, "coerce");
// Copy from the incoming argument pointer to the temporary with the
// appropriate alignment.
@@ -3503,8 +3503,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
ParameterABI::SwiftErrorResult) {
QualType pointeeTy = Ty->getPointeeType();
assert(pointeeTy->isPointerType());
- RawAddress temp = CreateMemTempWithoutCast(
- pointeeTy, getPointerAlign(), "swifterror.temp");
+ RawAddress temp =
+ CreateMemTemp(pointeeTy, getPointerAlign(), "swifterror.temp");
Address arg = makeNaturalAddressForPointer(
V, pointeeTy, getContext().getTypeAlignInChars(pointeeTy));
llvm::Value *incomingErrorValue = Builder.CreateLoad(arg);
@@ -3556,8 +3556,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
llvm::StructType *STy =
dyn_cast<llvm::StructType>(ArgI.getCoerceToType());
- Address Alloca = CreateMemTempWithoutCast(
- Ty, getContext().getDeclAlign(Arg), Arg->getName());
+ Address Alloca =
+ CreateMemTemp(Ty, getContext().getDeclAlign(Arg), Arg->getName());
// Pointer to store into.
Address Ptr = emitAddressAtOffset(*this, Alloca, ArgI);
@@ -3646,8 +3646,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
case ABIArgInfo::CoerceAndExpand: {
// Reconstruct into a temporary.
- Address alloca =
- CreateMemTempWithoutCast(Ty, getContext().getDeclAlign(Arg));
+ Address alloca = CreateMemTemp(Ty, getContext().getDeclAlign(Arg));
ArgVals.push_back(ParamValue::forIndirect(alloca));
auto coercionType = ArgI.getCoerceAndExpandType();
@@ -3688,8 +3687,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
// If this structure was expanded into multiple arguments then
// we need to create a temporary and reconstruct it from the
// arguments.
- Address Alloca =
- CreateMemTempWithoutCast(Ty, getContext().getDeclAlign(Arg));
+ Address Alloca = CreateMemTemp(Ty, getContext().getDeclAlign(Arg));
LValue LV = MakeAddrLValue(Alloca, Ty);
ArgVals.push_back(ParamValue::forIndirect(Alloca));
@@ -3706,8 +3704,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
case ABIArgInfo::TargetSpecific: {
auto *AI = Fn->getArg(FirstIRArg);
AI->setName(Arg->getName() + ".target_coerce");
- Address Alloca = CreateMemTempWithoutCast(
- Ty, getContext().getDeclAlign(Arg), Arg->getName());
+ Address Alloca =
+ CreateMemTemp(Ty, getContext().getDeclAlign(Arg), Arg->getName());
Address Ptr = emitAddressAtOffset(*this, Alloca, ArgI);
CGM.getABIInfo().createCoercedStore(AI, Ptr, ArgI, false, *this);
if (CodeGenFunction::hasScalarEvaluationKind(Ty)) {
@@ -3726,8 +3724,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
assert(NumIRArgs == 0);
// Initialize the local variable appropriately.
if (!hasScalarEvaluationKind(Ty)) {
- ArgVals.push_back(
- ParamValue::forIndirect(CreateMemTempWithoutCast(Ty)));
+ ArgVals.push_back(ParamValue::forIndirect(CreateMemTemp(Ty)));
} else {
llvm::Value *U = llvm::UndefValue::get(ConvertType(Arg->getType()));
ArgVals.push_back(ParamValue::forDirect(U));
@@ -5034,7 +5031,7 @@ struct DestroyUnpassedArg final : EHScopeStack::Cleanup {
RValue CallArg::getRValue(CodeGenFunction &CGF) const {
if (!HasLV)
return RV;
- LValue Copy = CGF.MakeAddrLValue(CGF.CreateMemTempWithoutCast(Ty), Ty);
+ LValue Copy = CGF.MakeAddrLValue(CGF.CreateMemTemp(Ty), Ty);
CGF.EmitAggregateCopy(Copy, LV, Ty, AggValueSlot::DoesNotOverlap,
LV.isVolatile());
IsUsed = true;
@@ -5613,8 +5610,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
// For indirect things such as overaligned structs, replace the
// placeholder with a regular aggregate temporary alloca. Store the
// address of this alloca into the struct.
- Addr =
- CreateMemTempWithoutCast(info_it->type, "inalloca.indirect.tmp");
+ Addr = CreateMemTemp(info_it->type, "inalloca.indirect.tmp");
Address ArgSlot = Builder.CreateStructGEP(
ArgMemory, ArgInfo.getInAllocaFieldIndex());
Builder.CreateStore(Addr.getPointer(), ArgSlot);
@@ -5759,8 +5755,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
swiftErrorArg = makeNaturalAddressForPointer(
V, pointeeTy, getContext().getTypeAlignInChars(pointeeTy));
- swiftErrorTemp = CreateMemTempWithoutCast(
- pointeeTy, getPointerAlign(), "swifterror.temp");
+ swiftErrorTemp =
+ CreateMemTemp(pointeeTy, getPointerAlign(), "swifterror.temp");
V = swiftErrorTemp.getPointer();
cast<llvm::AllocaInst>(V)->setSwiftError(true);
@@ -5795,7 +5791,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
// FIXME: Avoid the conversion through memory if possible.
Address Src = Address::invalid();
if (!I->isAggregate()) {
- Src = CreateMemTempWithoutCast(I->Ty, "coerce");
+ Src = CreateMemTemp(I->Ty, "coerce");
I->copyInto(*this, Src);
} else {
Src = I->hasLValue() ? I->getKnownLValue().getAddress()
@@ -5952,7 +5948,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
case ABIArgInfo::TargetSpecific: {
Address Src = Address::invalid();
if (!I->isAggregate()) {
- Src = CreateMemTempWithoutCast(I->Ty, "target_coerce");
+ Src = CreateMemTemp(I->Ty, "target_coerce");
I->copyInto(*this, Src);
} else {
Src = I->hasLValue() ? I->getKnownLValue().getAddress()
@@ -6488,7 +6484,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
getContext().getTypeInfoDataSizeInChars(RetTy).Width.getQuantity();
if (!DestPtr.isValid()) {
- DestPtr = CreateMemTempWithoutCast(RetTy, "coerce");
+ DestPtr = CreateMemTemp(RetTy, "coerce");
DestIsVolatile = false;
DestSize = getContext().getTypeSizeInChars(RetTy).getQuantity();
}
@@ -6513,7 +6509,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI);
bool DestIsVolatile = ReturnValue.isVolatile();
if (!DestPtr.isValid()) {
- DestPtr = CreateMemTempWithoutCast(RetTy, "target_coerce");
+ DestPtr = CreateMemTemp(RetTy, "target_coerce");
DestIsVolatile = false;
}
CGM.getABIInfo().createCoercedStore(CI, StorePtr, RetAI, DestIsVolatile,
diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp
index e52c5f6af2851..de11e8bca43f1 100644
--- a/clang/lib/CodeGen/CGClass.cpp
+++ b/clang/lib/CodeGen/CGClass.cpp
@@ -3190,7 +3190,7 @@ void CodeGenFunction::EmitLambdaStaticInvokeBody(const CXXMethodDecl *MD) {
CanQualType LambdaType = getContext().getCanonicalTagType(Lambda);
CanQualType ThisType = getContext().getPointerType(LambdaType);
- Address ThisPtr = CreateMemTempWithoutCast(LambdaType, "unused.capture");
+ Address ThisPtr = CreateMemTemp(LambdaType, "unused.capture");
CallArgs.add(RValue::get(ThisPtr.emitRawPointer(*this)), ThisType);
EmitLambdaDelegatingInvokeBody(MD, CallArgs);
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 7608f8cb6fc7a..63ad0bc9ec238 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -2724,9 +2724,8 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
if (UseIndirectDebugAddress) {
auto PtrTy = getContext().getPointerType(Ty);
- AllocaPtr = CreateMemTempWithoutCast(
- PtrTy, getContext().getTypeAlignInChars(PtrTy),
- D.getName() + ".indirect_addr");
+ AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
+ D.getName() + ".indirect_addr");
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
}
@@ -2763,7 +2762,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
} else {
- // Otherwise, create a casted temporary to hold the value.
+ // Otherwise, create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
}
diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp
index 99dfaa80be429..0576582d34543 100644
--- a/clang/lib/CodeGen/CGException.cpp
+++ b/clang/lib/CodeGen/CGException.cpp
@@ -2130,7 +2130,7 @@ void CodeGenFunction::EmitSEHExceptionCodeSave(CodeGenFunction &ParentCGF,
// On Win64, the info is passed as the first parameter to the filter.
SEHInfo = &*CurFn->arg_begin();
SEHCodeSlotStack.push_back(
- CreateMemTempWithoutCast(getContext().IntTy, "__exception_code"));
+ CreateMemTemp(getContext().IntTy, "__exception_code"));
} else {
// On Win32, the EBP on entry to the filter points to the end of an
// exception registration object. It contains 6 32-bit fields, and the info
@@ -2204,7 +2204,7 @@ void CodeGenFunction::EnterSEHTryStmt(const SEHTryStmt &S) {
assert(Except);
EHCatchScope *CatchScope = EHStack.pushCatch(1);
SEHCodeSlotStack.push_back(
- CreateMemTempWithoutCast(getContext().IntTy, "__exception_code"));
+ CreateMemTemp(getContext().IntTy, "__exception_code"));
// If the filter is known to evaluate to 1, then we can use the clause
// "catch i8* null". We can't do this on x86 because the filter has to save
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 77fd47ed42f03..8ca4ee64136c8 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -500,10 +500,7 @@ static RawAddress createReferenceTemporary(CodeGenFunction &CGF,
// FIXME: Should we put the new global into a COMDAT?
return RawAddress(C, GV->getValueType(), alignment);
}
- RawAddress Addr = CGF.CreateMemTempWithoutCast(Ty, "ref.tmp");
- if (Alloca)
- *Alloca = Addr;
- return Addr;
+ return CGF.CreateMemTemp(Ty, "ref.tmp", Alloca);
}
case SD_Thread:
case SD_Static:
@@ -1630,7 +1627,7 @@ RValue CodeGenFunction::GetUndefRValue(QualType Ty) {
// identifiable address. Just because the contents of the value are undefined
// doesn't mean that the address can't be taken and compared.
case TEK_Aggregate: {
- Address DestPtr = CreateMemTempWithoutCast(Ty, "undef.agg.tmp");
+ Address DestPtr = CreateMemTemp(Ty, "undef.agg.tmp");
return RValue::getAggregate(DestPtr);
}
@@ -5970,7 +5967,7 @@ LValue CodeGenFunction::EmitCompoundLiteralLValue(const CompoundLiteralExpr *E){
// make sure to emit the VLA size.
EmitVariablyModifiedType(E->getType());
- Address DeclPtr = CreateMemTempWithoutCast(E->getType(), ".compoundliteral");
+ Address DeclPtr = CreateMemTemp(E->getType(), ".compoundliteral");
const Expr *InitExpr = E->getInitializer();
LValue Result = MakeAddrLValue(DeclPtr, E->getType(), AlignmentSource::Decl);
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 876d184af1930..00edf3e99a34e 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -2187,7 +2187,7 @@ void CodeGenFunction::EmitAggExpr(const Expr *E, AggValueSlot Slot) {
LValue CodeGenFunction::EmitAggExprToLValue(const Expr *E) {
assert(hasAggregateEvaluationKind(E->getType()) && "Invalid argument!");
- Address Temp = CreateMemTempWithoutCast(E->getType());
+ Address Temp = CreateMemTemp(E->getType());
LValue LV = MakeAddrLValue(Temp, E->getType());
EmitAggExpr(E, AggValueSlot::forLValue(LV, AggValueSlot::IsNotDestructed,
AggValueSlot::DoesNotNeedGCBarriers,
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 98d65715d45b9..a134f6aab9490 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -1387,7 +1387,7 @@ std::optional<LValue> CGHLSLRuntime::emitResourceArraySubscriptExpr(
// Create a temporary variable for the result, which is either going
// to be a single resource instance or a local array of resources (we need to
// return an LValue).
- RawAddress TmpVar = CGF.CreateMemTempWithoutCast(ResultTy);
+ RawAddress TmpVar = CGF.CreateMemTemp(ResultTy);
if (CGF.EmitLifetimeStart(TmpVar.getPointer()))
CGF.pushFullExprCleanup<CodeGenFunction::CallLifetimeEnd>(
NormalEHLifetimeMarker, TmpVar);
@@ -1489,14 +1489,19 @@ RawAddress CGHLSLRuntime::createBufferMatrixTempAddress(const LValue &LV,
"expected cbuffer matrix");
QualType MatQualTy = LV.getType();
+ llvm::Type *MemTy = CGF.ConvertTypeForMem(MatQualTy);
llvm::Type *LayoutTy = HLSLBufferLayoutBuilder(CGF.CGM).layOutType(MatQualTy);
- Address SrcAddr = LV.getAddress();
- if (LayoutTy == CGF.ConvertTypeForMem(MatQualTy))
- return SrcAddr;
+ if (LayoutTy == MemTy)
+ return LV.getAddress();
- RawAddress DestAlloca =
- CGF.CreateMemTempWithoutCast(MatQualTy, "matrix.buf.copy");
+ Address SrcAddr = LV.getAddress();
+ // NOTE: B\C CreateMemTemp flattens MatrixTypes which causes
+ // overlapping GEPs in emitBufferCopy. Use CreateTempAlloca with
+ // the non-padded layout.
+ CharUnits Align =
+ CharUnits::fromQuantity(CGF.CGM.getDataLayout().getABITypeAlign(MemTy));
+ RawAddress DestAlloca = CGF.CreateTempAlloca(MemTy, Align, "matrix.buf.copy");
emitBufferCopy(CGF, DestAlloca, SrcAddr, MatQualTy);
return DestAlloca;
}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 9548efbeb72cf..ec059f9dfef82 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -919,7 +919,7 @@ static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy,
BaseTy = BaseTy.getNonReferenceType();
while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) &&
!CGF.getContext().hasSameType(BaseTy, ElTy)) {
- Tmp = CGF.CreateMemTempWithoutCast(BaseTy);
+ Tmp = CGF.CreateMemTemp(BaseTy);
if (TopTmp.isValid())
CGF.Builder.CreateStore(Tmp.getPointer(), TopTmp);
else
@@ -2059,8 +2059,7 @@ Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF,
llvm::Value *ThreadID = getThreadID(CGF, Loc);
QualType Int32Ty =
CGF.getContext().getIntTypeForBitwidth(/*DestWidth*/ 32, /*Signed*/ true);
- Address ThreadIDTemp =
- CGF.CreateMemTempWithoutCast(Int32Ty, /*Name*/ ".threadid_temp.");
+ Address ThreadIDTemp = CGF.CreateMemTemp(Int32Ty, /*Name*/ ".threadid_temp.");
CGF.EmitStoreOfScalar(ThreadID,
CGF.MakeAddrLValue(ThreadIDTemp, Int32Ty));
@@ -2339,7 +2338,7 @@ void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF,
// int32 did_it = 0;
QualType KmpInt32Ty =
C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
- DidIt = CGF.CreateMemTempWithoutCast(KmpInt32Ty, ".omp.copyprivate.did_it");
+ DidIt = CGF.CreateMemTemp(KmpInt32Ty, ".omp.copyprivate.did_it");
CGF.Builder.CreateStore(CGF.Builder.getInt32(0), DidIt);
}
// Prepare arguments and build a call to __kmpc_single
@@ -2366,8 +2365,8 @@ void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF,
C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
/*IndexTypeQuals=*/0);
// Create a list of all private variables for copyprivate.
- Address CopyprivateList = CGF.CreateMemTempWithoutCast(
- CopyprivateArrayTy, ".omp.copyprivate.cpr_list");
+ Address CopyprivateList =
+ CGF.CreateMemTemp(CopyprivateArrayTy, ".omp.copyprivate.cpr_list");
for (unsigned I = 0, E = CopyprivateVars.size(); I < E; ++I) {
Address Elem = CGF.Builder.CreateConstArrayGEP(CopyprivateList, I);
CGF.Builder.CreateStore(
@@ -3994,8 +3993,8 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
KmpTaskAffinityInfoTy,
llvm::APInt(C.getTypeSize(C.getSizeType()), NumAffinities), nullptr,
ArraySizeModifier::Normal, /*IndexTypeQuals=*/0);
- AffinitiesArray = CGF.CreateMemTempWithoutCast(KmpTaskAffinityInfoArrayTy,
- ".affs.arr.addr");
+ AffinitiesArray =
+ CGF.CreateMemTemp(KmpTaskAffinityInfoArrayTy, ".affs.arr.addr");
AffinitiesArray = CGF.Builder.CreateConstArrayGEP(AffinitiesArray, 0);
NumOfElements = llvm::ConstantInt::get(CGM.Int32Ty, NumAffinities,
/*isSigned=*/false);
@@ -4032,7 +4031,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
LValue PosLVal;
if (HasIterator) {
PosLVal = CGF.MakeAddrLValue(
- CGF.CreateMemTempWithoutCast(C.getSizeType(), "affs.counter.addr"),
+ CGF.CreateMemTemp(C.getSizeType(), "affs.counter.addr"),
C.getSizeType());
CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.SizeTy, Pos), PosLVal);
}
@@ -4310,7 +4309,7 @@ SmallVector<llvm::Value *, 4> CGOpenMPRuntime::emitDepobjElementsSizes(
std::tie(NumDeps, Base) =
getDepobjElements(CGF, DepobjLVal, E->getExprLoc());
LValue NumLVal = CGF.MakeAddrLValue(
- CGF.CreateMemTempWithoutCast(C.getUIntPtrType(), "depobj.size.addr"),
+ CGF.CreateMemTemp(C.getUIntPtrType(), "depobj.size.addr"),
C.getUIntPtrType());
CGF.Builder.CreateStore(llvm::ConstantInt::get(CGF.IntPtrTy, 0),
NumLVal.getAddress());
@@ -4455,7 +4454,7 @@ std::pair<llvm::Value *, Address> CGOpenMPRuntime::emitDependClause(
KmpDependInfoTy, llvm::APInt(/*numBits=*/64, NumDependencies), nullptr,
ArraySizeModifier::Normal, /*IndexTypeQuals=*/0);
DependenciesArray =
- CGF.CreateMemTempWithoutCast(KmpDependInfoArrayTy, ".dep.arr.addr");
+ CGF.CreateMemTemp(KmpDependInfoArrayTy, ".dep.arr.addr");
DependenciesArray = CGF.Builder.CreateConstArrayGEP(DependenciesArray, 0);
NumOfElements = llvm::ConstantInt::get(CGM.Int32Ty, NumDependencies,
/*isSigned=*/false);
@@ -4468,8 +4467,7 @@ std::pair<llvm::Value *, Address> CGOpenMPRuntime::emitDependClause(
}
// Copy regular dependencies with iterators.
LValue PosLVal = CGF.MakeAddrLValue(
- CGF.CreateMemTempWithoutCast(C.getSizeType(), "dep.counter.addr"),
- C.getSizeType());
+ CGF.CreateMemTemp(C.getSizeType(), "dep.counter.addr"), C.getSizeType());
CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.SizeTy, Pos), PosLVal);
for (const OMPTaskDataTy::DependData &Dep : Dependencies) {
if (Dep.DepKind == OMPC_DEPEND_depobj || !Dep.IteratorExpr)
@@ -4560,7 +4558,7 @@ Address CGOpenMPRuntime::emitDepobjDependClause(
LValue PosLVal;
if (Dependencies.IteratorExpr) {
PosLVal = CGF.MakeAddrLValue(
- CGF.CreateMemTempWithoutCast(C.getSizeType(), "iterator.counter.addr"),
+ CGF.CreateMemTemp(C.getSizeType(), "iterator.counter.addr"),
C.getSizeType());
CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.SizeTy, Idx), PosLVal,
/*IsInit=*/true);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index cb0e7297f1a89..4e6c2aac0d17a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1374,7 +1374,7 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion(
// Initialize the counter variable for the loop.
QualType Int32Ty =
CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
- Address Counter = CGF.CreateMemTempWithoutCast(Int32Ty, "critical_counter");
+ Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
/*isInit=*/true);
@@ -1436,7 +1436,7 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
if (CastTy->isIntegerType() && ValTy->isIntegerType())
return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
CastTy->hasSignedIntegerRepresentation());
- Address CastItem = CGF.CreateMemTempWithoutCast(CastTy);
+ Address CastItem = CGF.CreateMemTemp(CastTy);
Address ValCastItem = CastItem.withElementType(Val->getType());
CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
LValueBaseInfo(AlignmentSource::Type),
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index b70667d04d1f6..71f88cdf58954 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -609,7 +609,7 @@ CodeGenFunction::EmitCompoundStmtWithoutScope(const CompoundStmt &S,
// We can't return an RValue here because there might be cleanups at
// the end of the StmtExpr. Because of that, we have to emit the result
// here into a temporary alloca.
- RetAlloca = CreateMemTempWithoutCast(ExprTy);
+ RetAlloca = CreateMemTemp(ExprTy);
EmitAnyExprToMem(E, RetAlloca, Qualifiers(),
/*IsInit*/ false);
}
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 1eaf8efa142c5..82307d3a064c6 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -426,7 +426,7 @@ void CodeGenFunction::GenerateOpenMPCapturedVars(
// and load it as a void pointer.
if (!CurField->getType()->isAnyPointerType()) {
ASTContext &Ctx = getContext();
- Address DstAddr = CreateMemTempWithoutCast(
+ Address DstAddr = CreateMemTemp(
Ctx.getUIntPtrType(),
Twine(CurCap->getCapturedVar()->getName(), ".casted"));
LValue DstLV = MakeAddrLValue(DstAddr, Ctx.getUIntPtrType());
@@ -5352,7 +5352,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
ParamTypes.push_back(PrivatesPtr->getType());
for (const Expr *E : Data.PrivateVars) {
const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
- RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast(
+ RawAddress PrivatePtr = CGF.CreateMemTemp(
CGF.getContext().getPointerType(E->getType()), ".priv.ptr.addr");
PrivatePtrs.emplace_back(VD, PrivatePtr);
CallArgs.push_back(PrivatePtr.getPointer());
@@ -5360,9 +5360,9 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
}
for (const Expr *E : Data.FirstprivateVars) {
const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
- RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast(
- CGF.getContext().getPointerType(E->getType()),
- ".firstpriv.ptr.addr");
+ RawAddress PrivatePtr =
+ CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()),
+ ".firstpriv.ptr.addr");
PrivatePtrs.emplace_back(VD, PrivatePtr);
FirstprivatePtrs.emplace_back(VD, PrivatePtr);
CallArgs.push_back(PrivatePtr.getPointer());
@@ -5370,9 +5370,9 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
}
for (const Expr *E : Data.LastprivateVars) {
const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
- RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast(
- CGF.getContext().getPointerType(E->getType()),
- ".lastpriv.ptr.addr");
+ RawAddress PrivatePtr =
+ CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()),
+ ".lastpriv.ptr.addr");
PrivatePtrs.emplace_back(VD, PrivatePtr);
CallArgs.push_back(PrivatePtr.getPointer());
ParamTypes.push_back(PrivatePtr.getType());
@@ -5383,7 +5383,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
Ty = CGF.getContext().getPointerType(Ty);
if (isAllocatableDecl(VD))
Ty = CGF.getContext().getPointerType(Ty);
- RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast(
+ RawAddress PrivatePtr = CGF.CreateMemTemp(
CGF.getContext().getPointerType(Ty), ".local.ptr.addr");
auto Result = UntiedLocalVars.insert(
std::make_pair(VD, std::make_pair(PrivatePtr, Address::invalid())));
@@ -5674,9 +5674,9 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
ParamTypes.push_back(PrivatesPtr->getType());
for (const Expr *E : Data.FirstprivateVars) {
const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
- RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast(
- CGF.getContext().getPointerType(E->getType()),
- ".firstpriv.ptr.addr");
+ RawAddress PrivatePtr =
+ CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()),
+ ".firstpriv.ptr.addr");
PrivatePtrs.emplace_back(VD, PrivatePtr);
CallArgs.push_back(PrivatePtr.getPointer());
ParamTypes.push_back(PrivatePtr.getType());
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 3ce0ef1235561..77ca3e0fee84f 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -2960,13 +2960,10 @@ class CodeGenFunction : public CodeGenTypeCache {
/// aggregate type.
AggValueSlot CreateAggTemp(QualType T, const Twine &Name = "tmp",
RawAddress *Alloca = nullptr) {
- RawAddress Addr = CreateMemTempWithoutCast(T, Name);
- if (Alloca)
- *Alloca = Addr;
return AggValueSlot::forAddr(
- Addr, T.getQualifiers(), AggValueSlot::IsNotDestructed,
- AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased,
- AggValueSlot::DoesNotOverlap);
+ CreateMemTemp(T.getUnqualifiedType(), Name, Alloca), T.getQualifiers(),
+ AggValueSlot::IsNotDestructed, AggValueSlot::DoesNotNeedGCBarriers,
+ AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap);
}
/// EvaluateExprAsBool - Perform the usual unary conversions on the specified
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 6d8c710c9fe4b..9b444206e8a3d 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -4513,7 +4513,7 @@ void MicrosoftCXXABI::emitThrow(CodeGenFunction &CGF, const CXXThrowExpr *E) {
QualType ThrowType = SubExpr->getType();
// The exception object lives on the stack and it's address is passed to the
// runtime function.
- Address AI = CGF.CreateMemTempWithoutCast(ThrowType);
+ Address AI = CGF.CreateMemTemp(ThrowType);
CGF.EmitAnyExprToMem(SubExpr, AI, ThrowType.getQualifiers(),
/*IsInit=*/true);
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index f8990ced2a577..161e883b85c2c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -2382,7 +2382,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
: Intrinsic::arm_strexd);
llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
- Address Tmp = CreateMemTempWithoutCast(E->getArg(0)->getType());
+ Address Tmp = CreateMemTemp(E->getArg(0)->getType());
Value *Val = EmitScalarExpr(E->getArg(0));
Builder.CreateStore(Val, Tmp);
@@ -4768,7 +4768,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
: Intrinsic::aarch64_stxp);
llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
- Address Tmp = CreateMemTempWithoutCast(E->getArg(0)->getType());
+ Address Tmp = CreateMemTemp(E->getArg(0)->getType());
EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
Tmp = Tmp.withElementType(STy);
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index ee1ccd83e3aa2..9645ed87b8ef3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -1024,14 +1024,14 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
}
case X86::BI_mm_setcsr:
case X86::BI__builtin_ia32_ldmxcsr: {
- RawAddress Tmp = CreateMemTempWithoutCast(E->getArg(0)->getType());
+ RawAddress Tmp = CreateMemTemp(E->getArg(0)->getType());
Builder.CreateStore(Ops[0], Tmp);
return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
Tmp.getPointer());
}
case X86::BI_mm_getcsr:
case X86::BI__builtin_ia32_stmxcsr: {
- RawAddress Tmp = CreateMemTempWithoutCast(E->getType());
+ RawAddress Tmp = CreateMemTemp(E->getType());
Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
Tmp.getPointer());
return Builder.CreateLoad(Tmp, "stmxcsr");
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 61ab591f55be9..4a57ca7767bd2 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3168,7 +3168,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
// FIXME: Cleanup.
assert(AI.isDirect() && "Unexpected ABI info for mixed regs");
llvm::StructType *ST = cast<llvm::StructType>(AI.getCoerceToType());
- Address Tmp = CGF.CreateMemTempWithoutCast(Ty);
+ Address Tmp = CGF.CreateMemTemp(Ty);
Tmp = Tmp.withElementType(ST);
assert(ST->getNumElements() == 2 && "Unexpected ABI info for mixed regs");
llvm::Type *TyLo = ST->getElementType(0);
@@ -3228,7 +3228,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
// The stored size of this structure is smaller than its actual size,
// which may lead to reading past the end of the register save area.
if (CoTy && (AI.getDirectOffset() == 8 || RegSize < TySize)) {
- Address Tmp = CGF.CreateMemTempWithoutCast(Ty);
+ Address Tmp = CGF.CreateMemTemp(Ty);
llvm::Value *Addr =
CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, GpOrFpOffset);
llvm::Value *Src = CGF.Builder.CreateAlignedLoad(CoTy, Addr, TyAlign);
@@ -3247,7 +3247,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
// Copy into a temporary if the type is more aligned than the
// register save area.
if (neededInt && TyAlign.getQuantity() > 8) {
- Address Tmp = CGF.CreateMemTempWithoutCast(Ty);
+ Address Tmp = CGF.CreateMemTemp(Ty);
CGF.Builder.CreateMemCpy(Tmp, RegAddr, TySize, false);
RegAddr = Tmp;
}
@@ -3271,7 +3271,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
? AI.getCoerceToType()
: llvm::StructType::get(CGF.DoubleTy, CGF.DoubleTy);
llvm::Value *V;
- Address Tmp = CGF.CreateMemTempWithoutCast(Ty);
+ Address Tmp = CGF.CreateMemTemp(Ty);
Tmp = Tmp.withElementType(ST);
V = CGF.Builder.CreateLoad(
RegAddrLo.withElementType(ST->getStructElementType(0)));
diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c
index 686190f9ef947..16b2b459e2cb2 100644
--- a/clang/test/CodeGen/scoped-atomic-ops.c
+++ b/clang/test/CodeGen/scoped-atomic-ops.c
@@ -140,41 +140,47 @@ int fi1a(int *i) {
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[ATOMIC_TEMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP9]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr addrspace(5) [[ATOMIC_TEMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[ATOMIC_TEMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
@@ -191,41 +197,47 @@ int fi1a(int *i) {
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
-// AMDGCN_CL_20-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4
-// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[ATOMIC_TEMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4
// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
-// AMDGCN_CL_20-NEXT: store i32 [[TMP9]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4
// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4
-// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr addrspace(5) [[ATOMIC_TEMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4
// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4
-// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
-// AMDGCN_CL_20-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[ATOMIC_TEMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4
// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
@@ -481,30 +493,36 @@ void fi2a(int *i) {
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
// AMDGCN_CL_DEF-NEXT: ret void
//
@@ -518,30 +536,36 @@ void fi2a(int *i) {
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
// AMDGCN_CL_20-NEXT: ret void
//
@@ -663,6 +687,22 @@ void fi2b(int *i) {
// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
@@ -672,67 +712,67 @@ void fi2b(int *i) {
// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_DEF-NEXT: ret void
@@ -764,6 +804,22 @@ void fi2b(int *i) {
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
@@ -773,67 +829,67 @@ void fi2b(int *i) {
// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_20-NEXT: ret void
@@ -1086,6 +1142,22 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
@@ -1095,67 +1167,67 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_DEF-NEXT: ret void
@@ -1187,6 +1259,22 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
@@ -1196,67 +1284,67 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_20-NEXT: ret void
@@ -1509,6 +1597,22 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
@@ -1518,67 +1622,67 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_DEF-NEXT: ret void
@@ -1610,6 +1714,22 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
@@ -1619,67 +1739,67 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_20-NEXT: ret void
@@ -1932,6 +2052,22 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
@@ -1941,67 +2077,67 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_DEF-NEXT: ret void
@@ -2033,6 +2169,22 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
@@ -2042,67 +2194,67 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_20-NEXT: ret void
@@ -2355,6 +2507,22 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h)
// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
@@ -2364,67 +2532,67 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h)
// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_DEF-NEXT: ret void
@@ -2456,6 +2624,22 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
@@ -2465,67 +2649,67 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h)
// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_20-NEXT: ret void
@@ -2778,6 +2962,22 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
@@ -2787,67 +2987,67 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_DEF-NEXT: ret void
@@ -2879,6 +3079,22 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
@@ -2888,67 +3104,67 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
// AMDGCN_CL_20-NEXT: ret void
@@ -3176,6 +3392,7 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
@@ -3191,8 +3408,8 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -3203,6 +3420,7 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
@@ -3219,8 +3437,8 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -3296,6 +3514,7 @@ _Bool fi4a(int *i) {
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
@@ -3311,8 +3530,8 @@ _Bool fi4a(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -3323,6 +3542,7 @@ _Bool fi4a(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
@@ -3339,8 +3559,8 @@ _Bool fi4a(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -3416,6 +3636,7 @@ _Bool fi4b(int *i) {
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
@@ -3431,8 +3652,8 @@ _Bool fi4b(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -3443,6 +3664,7 @@ _Bool fi4b(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
@@ -3459,8 +3681,8 @@ _Bool fi4b(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -3536,6 +3758,7 @@ _Bool fi4c(int *i) {
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
@@ -3551,8 +3774,8 @@ _Bool fi4c(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -3563,6 +3786,7 @@ _Bool fi4c(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
@@ -3579,8 +3803,8 @@ _Bool fi4c(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -3656,6 +3880,7 @@ _Bool fi4_clustr(int *i) {
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
@@ -3671,8 +3896,8 @@ _Bool fi4_clustr(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -3683,6 +3908,7 @@ _Bool fi4_clustr(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
@@ -3699,8 +3925,8 @@ _Bool fi4_clustr(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -3776,6 +4002,7 @@ _Bool fi4d(int *i) {
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
@@ -3791,8 +4018,8 @@ _Bool fi4d(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -3803,6 +4030,7 @@ _Bool fi4d(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
@@ -3819,8 +4047,8 @@ _Bool fi4d(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -3895,12 +4123,14 @@ _Bool fi4e(int *i) {
// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -3910,8 +4140,8 @@ _Bool fi4e(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -3922,12 +4152,14 @@ _Bool fi4e(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -3937,8 +4169,8 @@ _Bool fi4e(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -4012,12 +4244,14 @@ _Bool fi5a(int *i) {
// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4027,8 +4261,8 @@ _Bool fi5a(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -4039,12 +4273,14 @@ _Bool fi5a(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4054,8 +4290,8 @@ _Bool fi5a(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -4129,12 +4365,14 @@ _Bool fi5b(int *i) {
// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4144,8 +4382,8 @@ _Bool fi5b(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -4156,12 +4394,14 @@ _Bool fi5b(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4171,8 +4411,8 @@ _Bool fi5b(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -4245,12 +4485,14 @@ _Bool fi5c(int *i) {
// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4260,8 +4502,8 @@ _Bool fi5c(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -4272,12 +4514,14 @@ _Bool fi5c(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4287,8 +4531,8 @@ _Bool fi5c(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -4361,12 +4605,14 @@ _Bool fi5_clustr(int *i) {
// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4376,8 +4622,8 @@ _Bool fi5_clustr(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -4388,12 +4634,14 @@ _Bool fi5_clustr(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4403,8 +4651,8 @@ _Bool fi5_clustr(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -4477,12 +4725,14 @@ _Bool fi5d(int *i) {
// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4492,8 +4742,8 @@ _Bool fi5d(int *i) {
// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -4504,12 +4754,14 @@ _Bool fi5d(int *i) {
// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
@@ -4519,8 +4771,8 @@ _Bool fi5d(int *i) {
// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
-// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -5035,13 +5287,15 @@ int fi6e(int *c, int *d) {
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -5051,13 +5305,15 @@ int fi6e(int *c, int *d) {
// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -5105,13 +5361,15 @@ _Bool fi7a(_Bool *c) {
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -5121,13 +5379,15 @@ _Bool fi7a(_Bool *c) {
// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -5175,13 +5435,15 @@ _Bool fi7b(_Bool *c) {
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -5191,13 +5453,15 @@ _Bool fi7b(_Bool *c) {
// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -5245,13 +5509,15 @@ _Bool fi7c(_Bool *c) {
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -5261,13 +5527,15 @@ _Bool fi7c(_Bool *c) {
// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -5315,13 +5583,15 @@ _Bool fi7_clustr(_Bool *c) {
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -5331,13 +5601,15 @@ _Bool fi7_clustr(_Bool *c) {
// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -5385,13 +5657,15 @@ _Bool fi7d(_Bool *c) {
// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
//
@@ -5401,13 +5675,15 @@ _Bool fi7d(_Bool *c) {
// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0
// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
//
@@ -5459,22 +5735,26 @@ _Bool fi7e(_Bool *c) {
// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
-// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_DEF-NEXT: ret void
@@ -5488,22 +5768,26 @@ _Bool fi7e(_Bool *c) {
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 -1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
-// AMDGCN_CL_20-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// AMDGCN_CL_20-NEXT: store i32 -1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8
// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
// AMDGCN_CL_20-NEXT: ret void
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index 092d509b292fc..bf45a353851b4 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -20,9 +20,10 @@
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(5) [[X]], align 8
-// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0
@@ -32,13 +33,14 @@
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META4:![0-9]+]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
-// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X]], align 8
-// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8
+// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 0
@@ -56,7 +58,7 @@
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META4:![0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -88,26 +90,28 @@ __global__ void kernel1(int *x) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(5) [[X]], align 8
-// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]]
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
// CHECK-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
-// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X]], align 8
-// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8
+// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
-// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8, !align [[META5:![0-9]+]]
+// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8, !align [[META6:![0-9]+]]
// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP0]], align 4
@@ -122,7 +126,7 @@ __global__ void kernel1(int *x) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -167,7 +171,7 @@ __global__ void kernel2(int &x) {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
-// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8
// CHECK-SPIRV-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -191,7 +195,7 @@ __global__ void kernel2(int &x) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
-// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly captures(none) [[X:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META4]] {
+// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly captures(none) [[X:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4
// OPT-SPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4
@@ -257,7 +261,7 @@ __global__ void kernel3(__attribute__((address_space(2))) int *x,
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_func void @_Z4funcPi(
-// OPT-SPIRV-SAME: ptr addrspace(4) noundef captures(none) [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1]] {
+// OPT-SPIRV-SAME: ptr addrspace(4) noundef captures(none) [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[X]], align 4
// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
@@ -280,16 +284,16 @@ struct S {
// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel41S(
// CHECK-SAME: ptr addrspace(4) noundef byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8, addrspace(5)
-// CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[S]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
-// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr
-// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S_ASCAST]], i32 0, i32 0
+// CHECK-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_S]], align 8, addrspace(5)
+// CHECK-NEXT: [[S:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// CHECK-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[S]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S]], i32 0, i32 0
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X]], align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1
// CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S_ASCAST]], i32 0, i32 1
+// CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S]], i32 0, i32 1
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Y]], align 8
// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 0
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
@@ -298,18 +302,18 @@ struct S {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
-// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
-// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8
-// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p0.p2.i64(ptr align 8 [[S]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false)
-// CHECK-SPIRV-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4)
-// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S_ASCAST]], i32 0, i32 0
+// CHECK-SPIRV-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_S]], align 8
+// CHECK-SPIRV-NEXT: [[S:%.*]] = addrspacecast ptr [[COERCE]] to ptr addrspace(4)
+// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p4.p2.i64(ptr addrspace(4) align 8 [[S]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false)
+// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S]], i32 0, i32 0
// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X]], align 8
// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 0
// CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX]], align 4
// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1
// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[ARRAYIDX]], align 4
-// CHECK-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S_ASCAST]], i32 0, i32 1
+// CHECK-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S]], i32 0, i32 1
// CHECK-SPIRV-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[Y]], align 8
// CHECK-SPIRV-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP3]], i64 0
// CHECK-SPIRV-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX1]], align 4
@@ -320,12 +324,12 @@ struct S {
// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel41S(
// OPT-SAME: ptr addrspace(4) noundef readonly byref([[STRUCT_S:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// OPT-NEXT: [[ENTRY:.*:]]
-// OPT-NEXT: [[S_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META3:![0-9]+]]
-// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[S_SROA_0_0_COPYLOAD]] to ptr addrspace(1)
-// OPT-NEXT: [[S_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8
-// OPT-NEXT: [[S_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[S_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META3]]
-// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[S_SROA_2_0_COPYLOAD]] to ptr addrspace(1)
-// OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META3]]
+// OPT-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META4:![0-9]+]]
+// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[COERCE_SROA_0_0_COPYLOAD]] to ptr addrspace(1)
+// OPT-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8
+// OPT-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[COERCE_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META4]]
+// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[COERCE_SROA_2_0_COPYLOAD]] to ptr addrspace(1)
+// OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META4]]
// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1
// OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[TMP1]], align 4
// OPT-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(1) [[TMP2]], align 4
@@ -334,17 +338,17 @@ struct S {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
-// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_S:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_S:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
-// OPT-SPIRV-NEXT: [[S_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8
-// OPT-SPIRV-NEXT: [[S_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8
-// OPT-SPIRV-NEXT: [[S_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[S_SROA_2_0__SROA_IDX]], align 8
-// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[S_SROA_0_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8
+// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8
+// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[COERCE_SROA_2_0__SROA_IDX]], align 8
+// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4
// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
-// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[S_SROA_0_0_COPYLOAD]], align 4
-// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[S_SROA_2_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4
// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP2]], 1.000000e+00
-// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[S_SROA_2_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4
// OPT-SPIRV-NEXT: ret void
//
// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel41S(
@@ -376,9 +380,10 @@ __global__ void kernel4(struct S s) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr
// CHECK-NEXT: [[S_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S_ADDR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr addrspace(5) [[S]], align 8
-// CHECK-NEXT: [[S1:%.*]] = load ptr, ptr addrspace(5) [[S]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr [[S_ASCAST]], align 8
+// CHECK-NEXT: [[S1:%.*]] = load ptr, ptr [[S_ASCAST]], align 8
// CHECK-NEXT: store ptr [[S1]], ptr [[S_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP0]], i32 0, i32 0
@@ -397,13 +402,14 @@ __global__ void kernel4(struct S s) {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-SPIRV-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4)
// CHECK-SPIRV-NEXT: [[S_ADDR_ASCAST:%.*]] = addrspacecast ptr [[S_ADDR]] to ptr addrspace(4)
-// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr [[S]], align 8
-// CHECK-SPIRV-NEXT: [[S1:%.*]] = load ptr addrspace(4), ptr [[S]], align 8
+// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr addrspace(4) [[S_ASCAST]], align 8
+// CHECK-SPIRV-NEXT: [[S1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ASCAST]], align 8
// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[S1]], ptr addrspace(4) [[S_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr addrspace(4) [[TMP0]], i32 0, i32 0
@@ -436,7 +442,7 @@ __global__ void kernel4(struct S s) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -481,17 +487,17 @@ struct T {
// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel61T(
// CHECK-SAME: ptr addrspace(4) noundef byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 8, addrspace(5)
-// CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[T]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
-// CHECK-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[T]] to ptr
-// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T_ASCAST]], i32 0, i32 0
+// CHECK-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_T]], align 8, addrspace(5)
+// CHECK-NEXT: [[T:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// CHECK-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[T]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr], ptr [[X]], i64 0, i64 0
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
// CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP2]], 1.000000e+00
// CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX1]], align 4
-// CHECK-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T_ASCAST]], i32 0, i32 0
+// CHECK-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[X2]], i64 0, i64 1
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8
// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 0
@@ -501,19 +507,19 @@ struct T {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
-// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
-// CHECK-SPIRV-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 8
-// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p0.p2.i64(ptr align 8 [[T]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false)
-// CHECK-SPIRV-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4)
-// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T_ASCAST]], i32 0, i32 0
+// CHECK-SPIRV-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_T]], align 8
+// CHECK-SPIRV-NEXT: [[T:%.*]] = addrspacecast ptr [[COERCE]] to ptr addrspace(4)
+// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p4.p2.i64(ptr addrspace(4) align 8 [[T]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false)
+// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T]], i32 0, i32 0
// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr addrspace(4)], ptr addrspace(4) [[X]], i64 0, i64 0
// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX]], align 8
// CHECK-SPIRV-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP1]], i64 0
// CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX1]], align 4
// CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP2]], 1.000000e+00
// CHECK-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[ARRAYIDX1]], align 4
-// CHECK-SPIRV-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T_ASCAST]], i32 0, i32 0
+// CHECK-SPIRV-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T]], i32 0, i32 0
// CHECK-SPIRV-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x ptr addrspace(4)], ptr addrspace(4) [[X2]], i64 0, i64 1
// CHECK-SPIRV-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX3]], align 8
// CHECK-SPIRV-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP3]], i64 0
@@ -525,12 +531,12 @@ struct T {
// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel61T(
// OPT-SAME: ptr addrspace(4) noundef readonly byref([[STRUCT_T:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
// OPT-NEXT: [[ENTRY:.*:]]
-// OPT-NEXT: [[T_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META3]]
-// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[T_SROA_0_0_COPYLOAD]] to ptr addrspace(1)
-// OPT-NEXT: [[T_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8
-// OPT-NEXT: [[T_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[T_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META3]]
-// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[T_SROA_2_0_COPYLOAD]] to ptr addrspace(1)
-// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META3]]
+// OPT-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META4]]
+// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[COERCE_SROA_0_0_COPYLOAD]] to ptr addrspace(1)
+// OPT-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8
+// OPT-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[COERCE_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META4]]
+// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[COERCE_SROA_2_0_COPYLOAD]] to ptr addrspace(1)
+// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META4]]
// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00
// OPT-NEXT: store float [[ADD]], ptr addrspace(1) [[TMP1]], align 4
// OPT-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(1) [[TMP2]], align 4
@@ -539,17 +545,17 @@ struct T {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
-// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_T:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_T:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
-// OPT-SPIRV-NEXT: [[T_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8
-// OPT-SPIRV-NEXT: [[T_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8
-// OPT-SPIRV-NEXT: [[T_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[T_SROA_2_0__SROA_IDX]], align 8
-// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[T_SROA_0_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8
+// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8
+// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[COERCE_SROA_2_0__SROA_IDX]], align 8
+// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4
// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 1.000000e+00
-// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[T_SROA_0_0_COPYLOAD]], align 4
-// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[T_SROA_2_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4
// OPT-SPIRV-NEXT: [[ADD5:%.*]] = fadd contract float [[TMP2]], 2.000000e+00
-// OPT-SPIRV-NEXT: store float [[ADD5]], ptr addrspace(4) [[T_SROA_2_0_COPYLOAD]], align 4
+// OPT-SPIRV-NEXT: store float [[ADD5]], ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4
// OPT-SPIRV-NEXT: ret void
//
// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel61T(
@@ -581,9 +587,10 @@ __global__ void kernel6(struct T t) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(5) [[X]], align 8
-// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0
@@ -593,13 +600,14 @@ __global__ void kernel6(struct T t) {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
-// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X]], align 8
-// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8
+// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 0
@@ -617,7 +625,7 @@ __global__ void kernel6(struct T t) {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -652,10 +660,10 @@ struct SS {
// CHECK-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_SS:%.*]], align 8, addrspace(5)
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(5) [[A]], i32 0, i32 0
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A_ASCAST]], i32 0, i32 0
+// CHECK-NEXT: [[A1:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A1]], i32 0, i32 0
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4
// CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 3.000000e+00
@@ -663,13 +671,13 @@ struct SS {
// CHECK-NEXT: ret void
//
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
-// CHECK-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
// CHECK-SPIRV-NEXT: [[A:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
-// CHECK-SPIRV-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A]], i32 0, i32 0
-// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-SPIRV-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
-// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A_ASCAST]], i32 0, i32 0
+// CHECK-SPIRV-NEXT: [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-SPIRV-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A1]], i32 0, i32 0
+// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr addrspace(4) [[COERCE_DIVE]], align 8
+// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A1]], i32 0, i32 0
// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X]], align 8
// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4
// CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 3.000000e+00
@@ -685,7 +693,7 @@ struct SS {
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
-// OPT-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[A_COERCE]] to i64
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -713,13 +721,13 @@ __global__ void kernel8(struct SS a) {
*a.x += 3.f;
}
//.
-// CHECK: [[META3]] = !{}
-// CHECK: [[META4]] = !{i64 4}
+// CHECK: [[META4]] = !{}
+// CHECK: [[META5]] = !{i64 4}
//.
-// CHECK-SPIRV: [[META4]] = !{i32 1024, i32 1, i32 1}
-// CHECK-SPIRV: [[META5]] = !{i64 4}
+// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
+// CHECK-SPIRV: [[META6]] = !{i64 4}
//.
-// OPT: [[META3]] = !{}
+// OPT: [[META4]] = !{}
//.
-// OPT-SPIRV: [[META4]] = !{i32 1024, i32 1, i32 1}
+// OPT-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
//.
diff --git a/clang/test/CodeGenCUDA/atomic-options.hip b/clang/test/CodeGenCUDA/atomic-options.hip
index 7b319516a1010..28ef6c3e8521f 100644
--- a/clang/test/CodeGenCUDA/atomic-options.hip
+++ b/clang/test/CodeGenCUDA/atomic-options.hip
@@ -37,13 +37,15 @@
// DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]]
-// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// DEV-NEXT: ret void
//
// OPT-LABEL: define dso_local void @_Z12test_defaultPf(
@@ -53,13 +55,15 @@
// OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
-// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// OPT-NEXT: ret void
//
// SPIRV-DEV-LABEL: define spir_func void @_Z12test_defaultPf(
@@ -69,13 +73,15 @@
// SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: ret void
//
// SPIRV-OPT-LABEL: define spir_func void @_Z12test_defaultPf(
@@ -85,13 +91,15 @@
// SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: ret void
//
__device__ __host__ void test_default(float *a) {
@@ -120,13 +128,15 @@ __device__ __host__ void test_default(float *a) {
// DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// DEV-NEXT: ret void
//
// OPT-LABEL: define dso_local void @_Z8test_onePf(
@@ -136,13 +146,15 @@ __device__ __host__ void test_default(float *a) {
// OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
-// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// OPT-NEXT: ret void
//
// SPIRV-DEV-LABEL: define spir_func void @_Z8test_onePf(
@@ -152,13 +164,15 @@ __device__ __host__ void test_default(float *a) {
// SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: ret void
//
// SPIRV-OPT-LABEL: define spir_func void @_Z8test_onePf(
@@ -168,13 +182,15 @@ __device__ __host__ void test_default(float *a) {
// SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: ret void
//
__device__ __host__ void test_one(float *a) {
@@ -205,13 +221,15 @@ __device__ __host__ void test_one(float *a) {
// DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
-// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// DEV-NEXT: ret void
//
// OPT-LABEL: define dso_local void @_Z8test_twoPf(
@@ -221,13 +239,15 @@ __device__ __host__ void test_one(float *a) {
// OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META3]]
-// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// OPT-NEXT: ret void
//
// SPIRV-DEV-LABEL: define spir_func void @_Z8test_twoPf(
@@ -237,13 +257,15 @@ __device__ __host__ void test_one(float *a) {
// SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: ret void
//
// SPIRV-OPT-LABEL: define spir_func void @_Z8test_twoPf(
@@ -253,13 +275,15 @@ __device__ __host__ void test_one(float *a) {
// SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: ret void
//
__device__ __host__ void test_two(float *a) {
@@ -290,13 +314,15 @@ __device__ __host__ void test_two(float *a) {
// DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]]
-// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// DEV-NEXT: ret void
//
// OPT-LABEL: define dso_local void @_Z10test_threePf(
@@ -306,13 +332,15 @@ __device__ __host__ void test_two(float *a) {
// OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]]
-// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// OPT-NEXT: ret void
//
// SPIRV-DEV-LABEL: define spir_func void @_Z10test_threePf(
@@ -322,13 +350,15 @@ __device__ __host__ void test_two(float *a) {
// SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: ret void
//
// SPIRV-OPT-LABEL: define spir_func void @_Z10test_threePf(
@@ -338,13 +368,15 @@ __device__ __host__ void test_two(float *a) {
// SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: ret void
//
__device__ __host__ void test_three(float *a) {
@@ -375,13 +407,15 @@ __device__ __host__ void test_three(float *a) {
// DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
-// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// DEV-NEXT: ret void
//
// OPT-LABEL: define dso_local void @_Z19test_multiple_attrsPf(
@@ -391,13 +425,15 @@ __device__ __host__ void test_three(float *a) {
// OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
// OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META3]]
-// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// OPT-NEXT: ret void
//
// SPIRV-DEV-LABEL: define spir_func void @_Z19test_multiple_attrsPf(
@@ -407,13 +443,15 @@ __device__ __host__ void test_three(float *a) {
// SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: ret void
//
// SPIRV-OPT-LABEL: define spir_func void @_Z19test_multiple_attrsPf(
@@ -423,13 +461,15 @@ __device__ __host__ void test_three(float *a) {
// SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
// SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: ret void
//
__device__ __host__ void test_multiple_attrs(float *a) {
@@ -490,31 +530,39 @@ __device__ __host__ void test_multiple_attrs(float *a) {
// DEV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5)
// DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// DEV-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// DEV-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
// DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
-// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// DEV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 2.000000e+00, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// DEV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// DEV-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// DEV-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// DEV-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4
-// DEV-NEXT: store float [[TMP6]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// DEV-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// DEV-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// DEV-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// DEV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 3.000000e+00, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// DEV-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// DEV-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// DEV-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// DEV-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META3]]
-// DEV-NEXT: store float [[TMP10]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// DEV-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// DEV-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// DEV-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// DEV-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// DEV-NEXT: store float 4.000000e+00, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// DEV-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// DEV-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// DEV-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// DEV-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META3]]
-// DEV-NEXT: store float [[TMP14]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// DEV-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// DEV-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// DEV-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// DEV-NEXT: ret void
//
// OPT-LABEL: define dso_local void @_Z11test_nestedPf(
@@ -530,31 +578,39 @@ __device__ __host__ void test_multiple_attrs(float *a) {
// OPT-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5)
// OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// OPT-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// OPT-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
// OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4
-// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4
+// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4
// OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]]
-// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4
-// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4
+// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4
// OPT-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 2.000000e+00, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
-// OPT-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP1]], align 4
+// OPT-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// OPT-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4
// OPT-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4
-// OPT-NEXT: store float [[TMP6]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
-// OPT-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4
+// OPT-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// OPT-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
// OPT-NEXT: [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 3.000000e+00, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
-// OPT-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP3]], align 4
+// OPT-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// OPT-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], align 4
// OPT-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META3]]
-// OPT-NEXT: store float [[TMP10]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
-// OPT-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4
+// OPT-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// OPT-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
// OPT-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// OPT-NEXT: store float 4.000000e+00, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
-// OPT-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP5]], align 4
+// OPT-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// OPT-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4
// OPT-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META3]]
-// OPT-NEXT: store float [[TMP14]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
-// OPT-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4
+// OPT-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// OPT-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
// OPT-NEXT: ret void
//
// SPIRV-DEV-LABEL: define spir_func void @_Z11test_nestedPf(
@@ -570,31 +626,39 @@ __device__ __host__ void test_multiple_attrs(float *a) {
// SPIRV-DEV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4
// SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP1]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP2]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP3]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP4]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP5]] to ptr addrspace(4)
+// SPIRV-DEV-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP6]] to ptr addrspace(4)
// SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1]], align 4
-// SPIRV-DEV-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-DEV-NEXT: store float 2.000000e+00, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr addrspace(4) [[TMP4]], float [[TMP5]] syncscope("device") seq_cst, align 4
-// SPIRV-DEV-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2]], align 4
-// SPIRV-DEV-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP6]], ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP8:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3]], align 4
-// SPIRV-DEV-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-DEV-NEXT: store float 3.000000e+00, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr addrspace(4) [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4]], align 4
-// SPIRV-DEV-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP10]], ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP12:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-DEV-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5]], align 4
-// SPIRV-DEV-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-DEV-NEXT: store float 4.000000e+00, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4
// SPIRV-DEV-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr addrspace(4) [[TMP12]], float [[TMP13]] syncscope("subgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META4]]
-// SPIRV-DEV-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6]], align 4
-// SPIRV-DEV-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-DEV-NEXT: store float [[TMP14]], ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4
+// SPIRV-DEV-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4
// SPIRV-DEV-NEXT: ret void
//
// SPIRV-OPT-LABEL: define spir_func void @_Z11test_nestedPf(
@@ -610,31 +674,39 @@ __device__ __host__ void test_multiple_attrs(float *a) {
// SPIRV-OPT-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4
// SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP1]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP2]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP3]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP4]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP5]] to ptr addrspace(4)
+// SPIRV-OPT-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP6]] to ptr addrspace(4)
// SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
-// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1]], align 4
-// SPIRV-OPT-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-OPT-NEXT: store float 2.000000e+00, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr addrspace(4) [[TMP4]], float [[TMP5]] syncscope("device") seq_cst, align 4
-// SPIRV-OPT-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2]], align 4
-// SPIRV-OPT-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP6]], ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP8:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3]], align 4
-// SPIRV-OPT-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-OPT-NEXT: store float 3.000000e+00, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr addrspace(4) [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4]], align 4
-// SPIRV-OPT-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP10]], ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP12:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
-// SPIRV-OPT-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5]], align 4
-// SPIRV-OPT-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-OPT-NEXT: store float 4.000000e+00, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4
// SPIRV-OPT-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr addrspace(4) [[TMP12]], float [[TMP13]] syncscope("subgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META4]]
-// SPIRV-OPT-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6]], align 4
-// SPIRV-OPT-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-OPT-NEXT: store float [[TMP14]], ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4
+// SPIRV-OPT-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4
// SPIRV-OPT-NEXT: ret void
//
__device__ __host__ void test_nested(float *a) {
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
index 35673773ec80c..7edf64db91f2e 100644
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -14,10 +14,11 @@
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DISPATCH_PTR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
@@ -38,10 +39,11 @@ __global__ void use_dispatch_ptr(int* out) {
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[QUEUE_PTR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
@@ -62,10 +64,11 @@ __global__ void use_queue_ptr(int* out) {
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMPLICITARG_PTR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
@@ -124,11 +127,12 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[X:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(5) [[SHARED]], align 8
-// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr addrspace(5) [[SHARED]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
@@ -172,11 +176,12 @@ __global__ void endpgm() {
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
@@ -198,9 +203,10 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
@@ -221,11 +227,12 @@ __device__ void func(float *x);
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[X:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(5) [[SHARED]], align 8
-// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr addrspace(5) [[SHARED]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
@@ -247,10 +254,11 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1, addrspace(5)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(5) [[X]], align 8
-// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP0]])
@@ -267,10 +275,11 @@ __global__ void test_is_shared(float *x){
// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1, addrspace(5)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(5) [[X]], align 8
-// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP0]])
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 94338f9027db1..7f48a8608af1d 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -14,10 +14,11 @@
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
@@ -32,10 +33,11 @@
// AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
@@ -55,10 +57,11 @@ __global__ void use_dispatch_ptr(int* out) {
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
@@ -73,10 +76,11 @@ __global__ void use_dispatch_ptr(int* out) {
// AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
@@ -96,10 +100,11 @@ __global__ void use_queue_ptr(int* out) {
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
@@ -114,10 +119,11 @@ __global__ void use_queue_ptr(int* out) {
// AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
@@ -198,11 +204,12 @@ __global__ void test_ds_fadd(float src) {
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[X:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8
-// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr [[SHARED]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
@@ -218,11 +225,12 @@ __global__ void test_ds_fadd(float src) {
// AMDGCNSPIRV-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
// AMDGCNSPIRV-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8
-// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr [[SHARED]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
@@ -265,11 +273,12 @@ __global__ void endpgm() {
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
@@ -286,11 +295,12 @@ __global__ void endpgm() {
// AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// AMDGCNSPIRV-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
@@ -312,9 +322,10 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META5:![0-9]+]])
// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
@@ -330,9 +341,10 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
// AMDGCNSPIRV-NEXT: entry:
// AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8
-// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META7:![0-9]+]])
// AMDGCNSPIRV-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
@@ -359,11 +371,12 @@ __device__ void func(float *x);
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[X:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8
-// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr [[SHARED]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
@@ -381,11 +394,12 @@ __device__ void func(float *x);
// AMDGCNSPIRV-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
// AMDGCNSPIRV-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8
-// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr [[SHARED]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
@@ -407,10 +421,11 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
// CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X]], align 8
-// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
@@ -424,10 +439,11 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
// AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[RET:%.*]] = alloca i8, align 1
+// AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X]], align 8
-// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
@@ -445,10 +461,11 @@ __global__ void test_is_shared(float *x){
// CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X]], align 8
-// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
@@ -462,10 +479,11 @@ __global__ void test_is_shared(float *x){
// AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
// AMDGCNSPIRV-NEXT: [[RET:%.*]] = alloca i8, align 1
+// AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
// AMDGCNSPIRV-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
-// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X]], align 8
-// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8
+// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
diff --git a/clang/test/CodeGenCUDA/record-layout.cu b/clang/test/CodeGenCUDA/record-layout.cu
index 12ce64f2be56d..847a81d88d280 100644
--- a/clang/test/CodeGenCUDA/record-layout.cu
+++ b/clang/test/CodeGenCUDA/record-layout.cu
@@ -65,9 +65,10 @@ struct J : I {
};
// DEV: define dso_local amdgpu_kernel void @_Z8C_kernel1C(ptr addrspace(4) noundef byref(%struct.C) align 4 %0)
-// DEV: %c = alloca %struct.C, align 4, addrspace(5)
-// DEV: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %c, ptr addrspace(4) align 4 %0, i64 8, i1 false)
-// DEV: %i = getelementptr inbounds nuw %struct.C, ptr %c.ascast, i32 0, i32 1
+// DEV: %coerce = alloca %struct.C, align 4, addrspace(5)
+// DEV: %c = addrspacecast ptr addrspace(5) %coerce to ptr
+// DEV: call void @llvm.memcpy.p0.p4.i64(ptr align 4 %c, ptr addrspace(4) align 4 %0, i64 8, i1 false)
+// DEV: %i = getelementptr inbounds nuw %struct.C, ptr %c, i32 0, i32 1
// DEV: store i32 1, ptr %i, align 4
__global__ void C_kernel(C c)
diff --git a/clang/test/CodeGenCXX/amdgcn-func-arg.cpp b/clang/test/CodeGenCXX/amdgcn-func-arg.cpp
index 3304be8eddade..3cc5dd7828464 100644
--- a/clang/test/CodeGenCXX/amdgcn-func-arg.cpp
+++ b/clang/test/CodeGenCXX/amdgcn-func-arg.cpp
@@ -1,30 +1,10 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -O0 -triple amdgcn -emit-llvm %s -o - | FileCheck %s
class A {
public:
int x;
-// CHECK-LABEL: define linkonce_odr void @_ZN1AC1Ev(
-// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[THIS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[THIS_ADDR]] to ptr
-// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call void @_ZN1AC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5:[0-9]+]]
-// CHECK-NEXT: ret void
-//
A():x(0) {}
-// CHECK-LABEL: define linkonce_odr void @_ZN1AD1Ev(
-// CHECK-SAME: ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[THIS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[THIS_ADDR]] to ptr
-// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call void @_ZN1AD2Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[THIS1]]) #[[ATTR6:[0-9]+]]
-// CHECK-NEXT: ret void
-//
~A() {}
};
@@ -38,13 +18,13 @@ B g_b;
void func_with_ref_arg(A &a);
void func_with_ref_arg(B &b);
-// CHECK-LABEL: define dso_local void @_Z22func_with_indirect_arg1A(
-// CHECK-SAME: ptr addrspace(5) noundef [[A:%.*]]) #[[ATTR1]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-LABEL: @_Z22func_with_indirect_arg1A(
+// CHECK-NEXT: entry:
// CHECK-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_INDIRECT_ADDR]] to ptr
// CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr
-// CHECK-NEXT: store ptr addrspace(5) [[A]], ptr addrspace(5) [[A_INDIRECT_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(5) [[A:%.*]], ptr [[A_INDIRECT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
// CHECK-NEXT: store ptr [[A_ASCAST]], ptr [[P_ASCAST]], align 8
// CHECK-NEXT: ret void
@@ -53,19 +33,19 @@ void func_with_indirect_arg(A a) {
A *p = &a;
}
-// CHECK-LABEL: define dso_local void @_Z22test_indirect_arg_autov(
-// CHECK-SAME: ) #[[ATTR1]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-LABEL: @_Z22test_indirect_arg_autov(
+// CHECK-NEXT: entry:
// CHECK-NEXT: [[A:%.*]] = alloca [[CLASS_A:%.*]], align 4, addrspace(5)
// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_A]], align 4, addrspace(5)
// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: call void @_ZN1AC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]]) #[[ATTR5]]
-// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 [[A_ASCAST]], i64 4, i1 false)
-// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP]]) #[[ATTR5]]
// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
-// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]]) #[[ATTR6]]
-// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]]) #[[ATTR5]]
-// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[A_ASCAST]]) #[[ATTR6]]
+// CHECK-NEXT: call void @_ZN1AC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]])
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 [[A_ASCAST]], i64 4, i1 false)
+// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
+// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP_ASCAST_ASCAST]])
+// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]])
+// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]])
+// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[A_ASCAST]])
// CHECK-NEXT: ret void
//
void test_indirect_arg_auto() {
@@ -74,15 +54,15 @@ void test_indirect_arg_auto() {
func_with_ref_arg(a);
}
-// CHECK-LABEL: define dso_local void @_Z24test_indirect_arg_globalv(
-// CHECK-SAME: ) #[[ATTR1]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-LABEL: @_Z24test_indirect_arg_globalv(
+// CHECK-NEXT: entry:
// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_A:%.*]], align 4, addrspace(5)
-// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 addrspacecast (ptr addrspace(1) @g_a to ptr), i64 4, i1 false)
-// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP]]) #[[ATTR5]]
// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
-// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]]) #[[ATTR6]]
-// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) addrspacecast (ptr addrspace(1) @g_a to ptr)) #[[ATTR5]]
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 addrspacecast (ptr addrspace(1) @g_a to ptr), i64 4, i1 false)
+// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
+// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP_ASCAST_ASCAST]])
+// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]])
+// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) addrspacecast (ptr addrspace(1) @g_a to ptr))
// CHECK-NEXT: ret void
//
void test_indirect_arg_global() {
@@ -90,30 +70,30 @@ void test_indirect_arg_global() {
func_with_ref_arg(g_a);
}
-// CHECK-LABEL: define dso_local void @_Z19func_with_byval_arg1B(
-// CHECK-SAME: ptr addrspace(5) noundef byref([[CLASS_B:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR1]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[CLASS_B]], align 4, addrspace(5)
+// CHECK-LABEL: @_Z19func_with_byval_arg1B(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[COERCE:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5)
// CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
// CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr
-// CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[B]], ptr addrspace(5) align 4 [[TMP0]], i64 400, i1 false)
-// CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr
-// CHECK-NEXT: store ptr [[B_ASCAST]], ptr [[P_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[B]], ptr addrspace(5) align 4 [[TMP0:%.*]], i64 400, i1 false)
+// CHECK-NEXT: store ptr [[B]], ptr [[P_ASCAST]], align 8
// CHECK-NEXT: ret void
//
void func_with_byval_arg(B b) {
B *p = &b;
}
-// CHECK-LABEL: define dso_local void @_Z19test_byval_arg_autov(
-// CHECK-SAME: ) #[[ATTR1]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-LABEL: @_Z19test_byval_arg_autov(
+// CHECK-NEXT: entry:
// CHECK-NEXT: [[B:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5)
// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_B]], align 4, addrspace(5)
// CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr
-// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 [[B_ASCAST]], i64 400, i1 false)
-// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP]]) #[[ATTR5]]
-// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) [[B_ASCAST]]) #[[ATTR5]]
+// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 [[B_ASCAST]], i64 400, i1 false)
+// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
+// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
+// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) [[B_ASCAST]])
// CHECK-NEXT: ret void
//
void test_byval_arg_auto() {
@@ -122,13 +102,14 @@ void test_byval_arg_auto() {
func_with_ref_arg(b);
}
-// CHECK-LABEL: define dso_local void @_Z21test_byval_arg_globalv(
-// CHECK-SAME: ) #[[ATTR1]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-LABEL: @_Z21test_byval_arg_globalv(
+// CHECK-NEXT: entry:
// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5)
-// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 addrspacecast (ptr addrspace(1) @g_b to ptr), i64 400, i1 false)
-// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP]]) #[[ATTR5]]
-// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) addrspacecast (ptr addrspace(1) @g_b to ptr)) #[[ATTR5]]
+// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 addrspacecast (ptr addrspace(1) @g_b to ptr), i64 400, i1 false)
+// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5)
+// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]])
+// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) addrspacecast (ptr addrspace(1) @g_b to ptr))
// CHECK-NEXT: ret void
//
void test_byval_arg_global() {
diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
index 38b58203be745..62ea3c991f26c 100644
--- a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
+++ b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
@@ -109,9 +109,7 @@ std::initializer_list<int> thread_local x = {1, 2, 3, 4};
// AMDGCN: store ptr addrspacecast ({{[^@]+}} @_ZGR15globalInitList2_ {{[^)]+}}),
// AMDGCN: ptr addrspacecast ({{[^@]+}} @globalInitList2 {{[^)]+}}), align 8
// AMDGCN: store i64 2, ptr getelementptr inbounds nuw (i8, ptr addrspacecast ({{[^@]+}} @globalInitList2 {{[^)]+}}), i64 8), align 8
-// AMDGCN-NEXT: {{.*}} = addrspacecast ptr addrspace(5) {{.*}} to ptr
// CHECK: call void @_ZN10destroyme1D1Ev
-// AMDGCN-NEXT: {{.*}} = addrspacecast ptr addrspace(5) {{.*}} to ptr
// CHECK-NEXT: call void @_ZN10destroyme1D1Ev
// CHECK-NEXT: ret void
std::initializer_list<witharg1> globalInitList2 = {
@@ -123,6 +121,7 @@ void fn1(int i) {
// temporary array
// X86: [[array:%[^ ]+]] = alloca [3 x i32]
// AMDGCN: [[alloca:%[^ ]+]] = alloca [3 x i32], align 4, addrspace(5)
+ // AMDGCN: [[array:%[^ ]+]] ={{.*}} addrspacecast ptr addrspace(5) [[alloca]] to ptr
// CHECK: store i32 1, ptr
// CHECK-NEXT: getelementptr
// CHECK-NEXT: store
@@ -483,7 +482,7 @@ namespace B19773010 {
// CHECK-LABEL: @_ZN9B197730102f1Ev
testcase a{{"", ENUM_CONSTANT}};
// X86: store ptr @.ref.tmp{{.*}}, ptr %{{.*}}, align 8
- // AMDGCN: store ptr addrspacecast{{.*}} @.ref.tmp{{.*}}{{.*}}, ptr addrspace(5) %{{.*}}, align 8
+ // AMDGCN: store ptr addrspacecast{{.*}} @.ref.tmp{{.*}}{{.*}}, ptr %{{.*}}, align 8
}
void f2() {
// CHECK-LABEL: @_ZN9B197730102f2Ev
diff --git a/clang/test/CodeGenHIP/placement-new-addrspace.hip b/clang/test/CodeGenHIP/placement-new-addrspace.hip
index eceb9b5bb3dd9..48a401baf9a78 100644
--- a/clang/test/CodeGenHIP/placement-new-addrspace.hip
+++ b/clang/test/CodeGenHIP/placement-new-addrspace.hip
@@ -33,9 +33,10 @@ __attribute__((device)) Big make_big() { return Big(7); }
// CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE]], ptr addrspace(5) [[OUT]], align 8
-// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE]], ptr [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8
// CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_BIG]]) align 4 [[TMP]]) #[[ATTR3]]
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 5c33c5ca8a4f9..32ab1372ae591 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -731,10 +731,11 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN20-NEXT: [[ENTRY:.*:]]
// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
-// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
// AMDGCN20-NEXT: ret void
@@ -745,9 +746,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN20-NEXT: [[ENTRY:.*:]]
// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
@@ -856,12 +858,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN20-NEXT: [[ENTRY:.*:]]
// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
-// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
// AMDGCN20-NEXT: ret void
@@ -872,9 +875,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN20-NEXT: [[ENTRY:.*:]]
// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index 9dabe64d8b75b..ffeb942b6e0a3 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -205,10 +205,11 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
-// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
// AMDGCN-NEXT: ret void
@@ -219,9 +220,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
-// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
@@ -330,12 +332,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
-// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
// AMDGCN-NEXT: ret void
@@ -346,9 +349,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
-// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
-// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index 87de7a1087411..6b94d5b868cec 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -129,6 +129,7 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[BLOCK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
// NOCPU-NEXT: [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
// NOCPU-NEXT: [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
+// NOCPU-NEXT: [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr
// NOCPU-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
// NOCPU-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8
// NOCPU-NEXT: store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1
@@ -479,6 +480,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[BLOCK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
// GFX900-NEXT: [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
// GFX900-NEXT: [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
+// GFX900-NEXT: [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr
// GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15]]
// GFX900-NEXT: store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17]]
diff --git a/clang/test/CodeGenOpenCL/atomic-ops.cl b/clang/test/CodeGenOpenCL/atomic-ops.cl
index 18a3401f3a339..28d1f572421f6 100644
--- a/clang/test/CodeGenOpenCL/atomic-ops.cl
+++ b/clang/test/CodeGenOpenCL/atomic-ops.cl
@@ -344,11 +344,12 @@ int test_volatile(volatile atomic_int *i) {
// CHECK-LABEL: @test_volatile
// CHECK: %[[i_addr:.*]] = alloca ptr
// CHECK-NEXT: %[[atomicdst:.*]] = alloca i32
+ // CHECK-NEXT: %[[atomicdst_ascast:.*]] = addrspacecast ptr addrspace(5) %[[atomicdst]] to ptr
// CHECK-NEXT: store ptr %i, ptr addrspace(5) %[[i_addr]]
// CHECK-NEXT: %[[addr:.*]] = load ptr, ptr addrspace(5) %[[i_addr]]
// CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr %[[addr]] syncscope("workgroup") seq_cst, align 4{{$}}
- // CHECK-NEXT: store i32 %[[res]], ptr addrspace(5) %[[atomicdst]]
- // CHECK-NEXT: %[[retval:.*]] = load i32, ptr addrspace(5) %[[atomicdst]]
+ // CHECK-NEXT: store i32 %[[res]], ptr %[[atomicdst_ascast]]
+ // CHECK-NEXT: %[[retval:.*]] = load i32, ptr %[[atomicdst_ascast]]
// CHECK-NEXT: ret i32 %[[retval]]
return __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_work_group);
}
diff --git a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp
index 270671284c3d1..528d27f85e54b 100644
--- a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp
+++ b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp
@@ -387,11 +387,11 @@ int main() {
// CHECK-AMDGCN-NEXT: define dso_local amdgpu_kernel void @_ZTS26single_purpose_kernel_name
// CHECK-AMDGCN-SAME: (ptr addrspace(4) noundef byref(%struct.single_purpose_kernel) align 1 %0) #[[AMDGCN_ATTR0:[0-9]+]] {
// CHECK-AMDGCN-NEXT: entry:
-// CHECK-AMDGCN-NEXT: %kernelFunc = alloca %struct.single_purpose_kernel, align 1, addrspace(5)
-// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false)
-// CHECK-AMDGCN-NEXT: %kernelFunc.ascast = addrspacecast ptr addrspace(5) %kernelFunc to ptr
+// CHECK-AMDGCN-NEXT: %coerce = alloca %struct.single_purpose_kernel, align 1, addrspace(5)
+// CHECK-AMDGCN-NEXT: %kernelFunc = addrspacecast ptr addrspace(5) %coerce to ptr
+// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false)
// CHECK-AMDGCN-NEXT: call void @_ZNK21single_purpose_kernelclEv
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc.ascast) #[[AMDGCN_ATTR1:[0-9]+]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc) #[[AMDGCN_ATTR1:[0-9]+]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
// CHECK-AMDGCN: define linkonce_odr void @_ZNK21single_purpose_kernelclEv
@@ -425,11 +425,11 @@ int main() {
// CHECK-AMDGCN-SAME: (i32 %kernelFunc.coerce) #[[AMDGCN_ATTR0]] {
// CHECK-AMDGCN-NEXT: entry:
// CHECK-AMDGCN-NEXT: %kernelFunc = alloca %class.anon, align 4, addrspace(5)
-// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %kernelFunc, i32 0, i32 0
-// CHECK-AMDGCN-NEXT: store i32 %kernelFunc.coerce, ptr addrspace(5) %coerce.dive, align 4
-// CHECK-AMDGCN-NEXT: %kernelFunc.ascast = addrspacecast ptr addrspace(5) %kernelFunc to ptr
+// CHECK-AMDGCN-NEXT: %kernelFunc1 = addrspacecast ptr addrspace(5) %kernelFunc to ptr
+// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %kernelFunc1, i32 0, i32 0
+// CHECK-AMDGCN-NEXT: store i32 %kernelFunc.coerce, ptr %coerce.dive, align 4
// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %kernelFunc.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %kernelFunc1, i32 noundef 42) #[[AMDGCN_ATTR1]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
// CHECK-AMDGCN: define internal void @_ZZ4mainENKUlT_E_clIiEEDaS_
@@ -462,11 +462,11 @@ int main() {
// CHECK-AMDGCN-NEXT: define dso_local amdgpu_kernel void @"_ZTS6\CE\B4\CF\84\CF\87"
// CHECK-AMDGCN-SAME: (ptr addrspace(4) noundef byref(%class.anon.0) align 1 %0) #[[AMDGCN_ATTR0]] {
// CHECK-AMDGCN-NEXT: entry:
-// CHECK-AMDGCN-NEXT: %kernelFunc = alloca %class.anon.0, align 1, addrspace(5)
-// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false)
-// CHECK-AMDGCN-NEXT: %kernelFunc.ascast = addrspacecast ptr addrspace(5) %kernelFunc to ptr
+// CHECK-AMDGCN-NEXT: %coerce = alloca %class.anon.0, align 1, addrspace(5)
+// CHECK-AMDGCN-NEXT: %kernelFunc = addrspacecast ptr addrspace(5) %coerce to ptr
+// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false)
// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUliE_clEi
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc.ascast, i32 noundef 42) #[[AMDGCN_ATTR1:[0-9]+]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc, i32 noundef 42) #[[AMDGCN_ATTR1:[0-9]+]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
// CHECK-AMDGCN: define internal void @_ZZ4mainENKUliE_clEi
@@ -502,18 +502,18 @@ int main() {
// CHECK-AMDGCN-NEXT: %k = alloca %class.anon.1, align 4, addrspace(5)
// CHECK-AMDGCN-NEXT: %a.addr = alloca i32, align 4, addrspace(5)
// CHECK-AMDGCN-NEXT: %b.addr = alloca i32, align 4, addrspace(5)
+// CHECK-AMDGCN-NEXT: %k2 = addrspacecast ptr addrspace(5) %k to ptr
// CHECK-AMDGCN-NEXT: %a.addr.ascast = addrspacecast ptr addrspace(5) %a.addr to ptr
// CHECK-AMDGCN-NEXT: %b.addr.ascast = addrspacecast ptr addrspace(5) %b.addr to ptr
-// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.1, ptr addrspace(5) %k, i32 0, i32 0
-// CHECK-AMDGCN-NEXT: %coerce.dive1 = getelementptr inbounds nuw %struct.copyable, ptr addrspace(5) %coerce.dive, i32 0, i32 0
-// CHECK-AMDGCN-NEXT: store i32 %k.coerce, ptr addrspace(5) %coerce.dive1, align 4
-// CHECK-AMDGCN-NEXT: %k.ascast = addrspacecast ptr addrspace(5) %k to ptr
+// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.1, ptr %k2, i32 0, i32 0
+// CHECK-AMDGCN-NEXT: %coerce.dive1 = getelementptr inbounds nuw %struct.copyable, ptr %coerce.dive, i32 0, i32 0
+// CHECK-AMDGCN-NEXT: store i32 %k.coerce, ptr %coerce.dive1, align 4
// CHECK-AMDGCN-NEXT: store i32 %a, ptr %a.addr.ascast, align 4
// CHECK-AMDGCN-NEXT: store i32 %b, ptr %b.addr.ascast, align 4
// CHECK-AMDGCN-NEXT: %0 = load i32, ptr %a.addr.ascast, align 4
// CHECK-AMDGCN-NEXT: %1 = load i32, ptr %b.addr.ascast, align 4
// CHECK-AMDGCN-NEXT: %call = call noundef i32 @_ZZ4mainENKUliiE_clEii
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %k.ascast, i32 noundef %0, i32 noundef %1) #[[AMDGCN_ATTR1:[0-9]+]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %k2, i32 noundef %0, i32 noundef %1) #[[AMDGCN_ATTR1:[0-9]+]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
//
@@ -539,9 +539,10 @@ int main() {
// CHECK-SPIRNV-NEXT: %k.indirect_addr = alloca ptr addrspace(4), align {{[48]}}
// CHECK-SPIRNV-NEXT: %a.addr = alloca i32, align 4
// CHECK-SPIRNV-NEXT: %b.addr = alloca i32, align 4
+// CHECK-SPIRNV-NEXT: %k.indirect_addr.ascast = addrspacecast ptr %k.indirect_addr to ptr addrspace(4)
// CHECK-SPIRNV-NEXT: %a.addr.ascast = addrspacecast ptr %a.addr to ptr addrspace(4)
// CHECK-SPIRNV-NEXT: %b.addr.ascast = addrspacecast ptr %b.addr to ptr addrspace(4)
-// CHECK-SPIRNV-NEXT: store ptr %k, ptr %k.indirect_addr, align {{[48]}}
+// CHECK-SPIRNV-NEXT: store ptr %k, ptr addrspace(4) %k.indirect_addr.ascast, align {{[48]}}
// CHECK-SPIRNV-NEXT: %k.ascast = addrspacecast ptr %k to ptr addrspace(4)
// CHECK-SPIRNV-NEXT: store i32 %a, ptr addrspace(4) %a.addr.ascast, align 4
// CHECK-SPIRNV-NEXT: store i32 %b, ptr addrspace(4) %b.addr.ascast, align 4
@@ -578,11 +579,11 @@ int main() {
// CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] {
// CHECK-AMDGCN-NEXT: entry:
// CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5)
-// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %ref, i32 0, i32 0
-// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4
-// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0
+// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4
// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
//
@@ -611,11 +612,11 @@ int main() {
// CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] {
// CHECK-AMDGCN-NEXT: entry:
// CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5)
-// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %ref, i32 0, i32 0
-// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4
-// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0
+// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4
// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
//
@@ -644,11 +645,11 @@ int main() {
// CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] {
// CHECK-AMDGCN-NEXT: entry:
// CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5)
-// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %ref, i32 0, i32 0
-// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4
-// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0
+// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4
// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
//
@@ -677,11 +678,11 @@ int main() {
// CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] {
// CHECK-AMDGCN-NEXT: entry:
// CHECK-AMDGCN-NEXT: %ref = alloca %class.anon.2, align 4, addrspace(5)
-// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.2, ptr addrspace(5) %ref, i32 0, i32 0
-// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4
-// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr
+// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.2, ptr %ref1, i32 0, i32 0
+// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4
// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E0_clIiEEDaS_
-// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]]
+// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]]
// CHECK-AMDGCN-NEXT: ret void
// CHECK-AMDGCN-NEXT: }
//
diff --git a/clang/test/OpenMP/amdgcn_target_device_vla.cpp b/clang/test/OpenMP/amdgcn_target_device_vla.cpp
index 5064c114c0863..3bdc95fbc1152 100644
--- a/clang/test/OpenMP/amdgcn_target_device_vla.cpp
+++ b/clang/test/OpenMP/amdgcn_target_device_vla.cpp
@@ -190,7 +190,9 @@ int main() {
// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr
// CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr
// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
@@ -203,12 +205,11 @@ int main() {
// CHECK: user_code.entry:
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[M_CASTED]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[M_CASTED_ASCAST]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8
// CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_omp_outlined(ptr [[TMP6]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
@@ -250,6 +251,7 @@ int main() {
// CHECK-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
// CHECK-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
// CHECK-NEXT: [[I3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I3]] to ptr
+// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
// CHECK-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
@@ -307,8 +309,8 @@ int main() {
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
// CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(5) [[M_CASTED]], align 4
-// CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP19]], ptr [[M_CASTED_ASCAST]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8
// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
// CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP16]] to ptr
// CHECK-NEXT: store ptr [[TMP22]], ptr [[TMP21]], align 8
@@ -557,7 +559,9 @@ int main() {
// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr
// CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr
// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
@@ -570,12 +574,11 @@ int main() {
// CHECK: user_code.entry:
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[M_CASTED]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[M_CASTED_ASCAST]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8
// CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52_omp_outlined(ptr [[TMP6]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]]
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
@@ -916,7 +919,10 @@ int main() {
// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr
// CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr
// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr
+// CHECK-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
@@ -930,15 +936,14 @@ int main() {
// CHECK: user_code.entry:
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[M_CASTED]], align 4
-// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[M_CASTED_ASCAST]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4
-// CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l76_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]]
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l76_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]]
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
diff --git a/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c b/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c
index 3f9d2225c7de1..eb0e38b5cf2cd 100644
--- a/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c
+++ b/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c
@@ -30,7 +30,9 @@ void write_to_aligned_array(int *a, int N) {
// CHECK-AMD-NEXT: [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr
// CHECK-AMD-NEXT: [[APTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[APTR_ADDR]] to ptr
// CHECK-AMD-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// CHECK-AMD-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// CHECK-AMD-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// CHECK-AMD-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// CHECK-AMD-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: store ptr [[APTR]], ptr [[APTR_ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
@@ -40,13 +42,12 @@ void write_to_aligned_array(int *a, int N) {
// CHECK-AMD: user_code.entry:
// CHECK-AMD-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// CHECK-AMD-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[N_CASTED]], align 4
-// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// CHECK-AMD-NEXT: store i32 [[TMP2]], ptr [[N_CASTED_ASCAST]], align 4
+// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// CHECK-AMD-NEXT: [[TMP4:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// CHECK-AMD-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// CHECK-AMD-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_write_to_aligned_array_l14_omp_outlined(ptr [[TMP5]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP3]], ptr [[TMP4]]) #[[ATTR2:[0-9]+]]
+// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// CHECK-AMD-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_write_to_aligned_array_l14_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP3]], ptr [[TMP4]]) #[[ATTR2:[0-9]+]]
// CHECK-AMD-NEXT: call void @__kmpc_target_deinit()
// CHECK-AMD-NEXT: ret void
// CHECK-AMD: worker.exit:
@@ -86,6 +87,7 @@ void write_to_aligned_array(int *a, int N) {
// CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
// CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
// CHECK-AMD-NEXT: [[I3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I3]] to ptr
+// CHECK-AMD-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// CHECK-AMD-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
@@ -140,8 +142,8 @@ void write_to_aligned_array(int *a, int N) {
// CHECK-AMD-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
// CHECK-AMD-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
// CHECK-AMD-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// CHECK-AMD-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[N_CASTED]], align 4
-// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// CHECK-AMD-NEXT: store i32 [[TMP17]], ptr [[N_CASTED_ASCAST]], align 4
+// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// CHECK-AMD-NEXT: [[TMP19:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
// CHECK-AMD-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP14]] to ptr
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index 0c04b3c429d7a..d6a1280cfcc26 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -40,7 +40,9 @@ int foo() {
// IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
// IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
// IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
// IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
@@ -51,12 +53,11 @@ int foo() {
// IR-GPU: user_code.entry:
// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[J_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(5) [[J_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// IR-GPU-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[TMP5]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
+// IR-GPU-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
// IR-GPU-NEXT: call void @__kmpc_target_deinit()
// IR-GPU-NEXT: ret void
// IR-GPU: worker.exit:
@@ -100,6 +101,7 @@ int foo() {
// IR-GPU-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
// IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
// IR-GPU-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
+// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
@@ -149,8 +151,8 @@ int foo() {
// IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
// IR-GPU-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
// IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[J_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(5) [[J_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
// IR-GPU-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
// IR-GPU-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp
index 3686ca7ea08e0..f484dd140bc6c 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp
@@ -51,7 +51,9 @@ int main()
// IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr
// IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
@@ -68,12 +70,11 @@ int main()
// IR-GPU: user_code.entry:
// IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// IR-GPU-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]]
+// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]]
// IR-GPU-NEXT: call void @__kmpc_target_deinit()
// IR-GPU-NEXT: ret void
// IR-GPU: worker.exit:
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp
index 11ae386739b40..3a3220a170e93 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp
@@ -89,7 +89,9 @@ int main()
// IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr
// IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
@@ -106,12 +108,11 @@ int main()
// IR-GPU: user_code.entry:
// IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// IR-GPU-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]]
+// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]]
// IR-GPU-NEXT: call void @__kmpc_target_deinit()
// IR-GPU-NEXT: ret void
// IR-GPU: worker.exit:
@@ -157,6 +158,7 @@ int main()
// IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
// IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
// IR-GPU-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
+// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
@@ -218,8 +220,8 @@ int main()
// IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
// IR-GPU-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
// IR-GPU-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP21]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
// IR-GPU-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to ptr
// IR-GPU-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 8
@@ -423,7 +425,9 @@ int main()
// IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr
// IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
@@ -440,12 +444,11 @@ int main()
// IR-GPU: user_code.entry:
// IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// IR-GPU-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l46_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]]
+// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l46_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]]
// IR-GPU-NEXT: call void @__kmpc_target_deinit()
// IR-GPU-NEXT: ret void
// IR-GPU: worker.exit:
@@ -499,6 +502,7 @@ int main()
// IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
// IR-GPU-NEXT: [[I11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I11]] to ptr
// IR-GPU-NEXT: [[J12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J12]] to ptr
+// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
@@ -572,8 +576,8 @@ int main()
// IR-GPU-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_COMB_LB_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_COMB_UB_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP22:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP22]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP23:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP24:%.*]] = getelementptr inbounds [7 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
// IR-GPU-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP20]] to ptr
// IR-GPU-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8
@@ -828,7 +832,10 @@ int main()
// IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr
// IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
+// IR-GPU-NEXT: [[NT_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NT_CASTED]] to ptr
// IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[NT]], ptr [[NT_ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
@@ -846,15 +853,14 @@ int main()
// IR-GPU: user_code.entry:
// IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP8:%.*]] = load i32, ptr [[NT_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP8]], ptr addrspace(5) [[NT_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP9:%.*]] = load i64, ptr addrspace(5) [[NT_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP8]], ptr [[NT_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP9:%.*]] = load i64, ptr [[NT_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// IR-GPU-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55_omp_outlined(ptr [[TMP10]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]]
+// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]]
// IR-GPU-NEXT: call void @__kmpc_target_deinit()
// IR-GPU-NEXT: ret void
// IR-GPU: worker.exit:
@@ -903,6 +909,8 @@ int main()
// IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
// IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
// IR-GPU-NEXT: [[I5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I5]] to ptr
+// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
+// IR-GPU-NEXT: [[NT_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NT_CASTED]] to ptr
// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
@@ -965,11 +973,11 @@ int main()
// IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
// IR-GPU-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
// IR-GPU-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP21]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[NT_ADDR_ASCAST]], align 4
-// IR-GPU-NEXT: store i32 [[TMP23]], ptr addrspace(5) [[NT_CASTED]], align 4
-// IR-GPU-NEXT: [[TMP24:%.*]] = load i64, ptr addrspace(5) [[NT_CASTED]], align 8
+// IR-GPU-NEXT: store i32 [[TMP23]], ptr [[NT_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP24:%.*]] = load i64, ptr [[NT_CASTED_ASCAST]], align 8
// IR-GPU-NEXT: [[TMP25:%.*]] = getelementptr inbounds [8 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
// IR-GPU-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP18]] to ptr
// IR-GPU-NEXT: store ptr [[TMP26]], ptr [[TMP25]], align 8
@@ -2954,7 +2962,9 @@ int main()
// IR-GPU-NESTED-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr
// IR-GPU-NESTED-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// IR-GPU-NESTED-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// IR-GPU-NESTED-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// IR-GPU-NESTED-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// IR-GPU-NESTED-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// IR-GPU-NESTED-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// IR-GPU-NESTED-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8
// IR-GPU-NESTED-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
@@ -2971,12 +2981,11 @@ int main()
// IR-GPU-NESTED: user_code.entry:
// IR-GPU-NESTED-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// IR-GPU-NESTED-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NESTED-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NESTED-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NESTED-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NESTED-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NESTED-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
-// IR-GPU-NESTED-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4
-// IR-GPU-NESTED-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
-// IR-GPU-NESTED-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]]
+// IR-GPU-NESTED-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// IR-GPU-NESTED-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]]
// IR-GPU-NESTED-NEXT: call void @__kmpc_target_deinit()
// IR-GPU-NESTED-NEXT: ret void
// IR-GPU-NESTED: worker.exit:
@@ -3030,6 +3039,7 @@ int main()
// IR-GPU-NESTED-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
// IR-GPU-NESTED-NEXT: [[I11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I11]] to ptr
// IR-GPU-NESTED-NEXT: [[J12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J12]] to ptr
+// IR-GPU-NESTED-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr
// IR-GPU-NESTED-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// IR-GPU-NESTED-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// IR-GPU-NESTED-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
@@ -3103,8 +3113,8 @@ int main()
// IR-GPU-NESTED-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_COMB_LB_ASCAST]], align 8
// IR-GPU-NESTED-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_COMB_UB_ASCAST]], align 8
// IR-GPU-NESTED-NEXT: [[TMP22:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
-// IR-GPU-NESTED-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[N_CASTED]], align 4
-// IR-GPU-NESTED-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8
+// IR-GPU-NESTED-NEXT: store i32 [[TMP22]], ptr [[N_CASTED_ASCAST]], align 4
+// IR-GPU-NESTED-NEXT: [[TMP23:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// IR-GPU-NESTED-NEXT: [[TMP24:%.*]] = getelementptr inbounds [7 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
// IR-GPU-NESTED-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP20]] to ptr
// IR-GPU-NESTED-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8
More information about the llvm-branch-commits
mailing list