[Mlir-commits] [llvm] [mlir] [clang] [OpenMP] Migrate GPU Reductions CodeGen from Clang to OMPIRBuilder (PR #80343)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Feb 1 13:34:21 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Akash Banerjee (TIFitis)
<details>
<summary>Changes</summary>
This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBUilder. In future patches MLIR OpenMP translation would be making use of these functions.
Co-authored-by: Jan Leyonberg <jan.leyonberg@<!-- -->amd.com>
---
Patch is 267.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80343.diff
8 Files Affected:
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+71-1303)
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (-3)
- (modified) clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp (+273-273)
- (modified) clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (+6-6)
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+640-32)
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+1420-66)
- (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+18-4)
``````````diff
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db..7cddf73306f2d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -501,31 +501,6 @@ class CheckVarsEscapingDeclContext final
};
} // anonymous namespace
-/// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
-}
-
-/// Get the id of the current lane in the Warp.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
- unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
- "nvptx_lane_id");
-}
-
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
@@ -1429,1132 +1404,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
TBAAAccessInfo());
}
-/// This function creates calls to one of two shuffle functions to copy
-/// variables between lanes in a warp.
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
- llvm::Value *Elem,
- QualType ElemType,
- llvm::Value *Offset,
- SourceLocation Loc) {
- CodeGenModule &CGM = CGF.CGM;
- CGBuilderTy &Bld = CGF.Builder;
- CGOpenMPRuntimeGPU &RT =
- *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
- llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- assert(Size.getQuantity() <= 8 &&
- "Unsupported bitwidth in shuffle instruction.");
-
- RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
- ? OMPRTL___kmpc_shuffle_int32
- : OMPRTL___kmpc_shuffle_int64;
-
- // Cast all types to 32- or 64-bit values before calling shuffle routines.
- QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
- Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
- llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
- llvm::Value *WarpSize =
- Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
-
- llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
- {ElemCast, Offset, WarpSize});
-
- return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
-}
-
-static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
- Address DestAddr, QualType ElemType,
- llvm::Value *Offset, SourceLocation Loc) {
- CGBuilderTy &Bld = CGF.Builder;
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- // Create the loop over the big sized data.
- // ptr = (void*)Elem;
- // ptrEnd = (void*) Elem + 1;
- // Step = 8;
- // while (ptr + Step < ptrEnd)
- // shuffle((int64_t)*ptr);
- // Step = 4;
- // while (ptr + Step < ptrEnd)
- // shuffle((int32_t)*ptr);
- // ...
- Address ElemPtr = DestAddr;
- Address Ptr = SrcAddr;
- Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
- for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
- if (Size < CharUnits::fromQuantity(IntSize))
- continue;
- QualType IntType = CGF.getContext().getIntTypeForBitwidth(
- CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
- /*Signed=*/1);
- llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
- Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
- IntTy);
- ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtr, IntTy->getPointerTo(), IntTy);
- if (Size.getQuantity() / IntSize > 1) {
- llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
- llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
- CGF.EmitBlock(PreCondBB);
- llvm::PHINode *PhiSrc =
- Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
- PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
- llvm::PHINode *PhiDest =
- Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
- PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
- Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
- ElemPtr =
- Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
- llvm::Value *PtrDiff = Bld.CreatePtrDiff(
- CGF.Int8Ty, PtrEnd.getPointer(),
- Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
- CGF.VoidPtrTy));
- Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
- ThenBB, ExitBB);
- CGF.EmitBlock(ThenBB);
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
- Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
- PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
- CGF.EmitBranch(PreCondBB);
- CGF.EmitBlock(ExitBB);
- } else {
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Ptr = Bld.CreateConstGEP(Ptr, 1);
- ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- }
- Size = Size % IntSize;
- }
-}
-
-namespace {
-enum CopyAction : unsigned {
- // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
- // the warp using shuffle instructions.
- RemoteLaneToThread,
- // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
- ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
- llvm::Value *RemoteLaneOffset;
- llvm::Value *ScratchpadIndex;
- llvm::Value *ScratchpadWidth;
-};
-
-/// Emit instructions to copy a Reduce list, which contains partially
-/// aggregated values, in the specified direction.
-static void emitReductionListCopy(
- CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
- ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
- CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
-
- CodeGenModule &CGM = CGF.CGM;
- ASTContext &C = CGM.getContext();
- CGBuilderTy &Bld = CGF.Builder;
-
- llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
-
- // Iterates, element-by-element, through the source Reduce list and
- // make a copy.
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- Address SrcElementAddr = Address::invalid();
- Address DestElementAddr = Address::invalid();
- Address DestElementPtrAddr = Address::invalid();
- // Should we shuffle in an element from a remote lane?
- bool ShuffleInElement = false;
- // Set to true to update the pointer in the dest Reduce list to a
- // newly created element.
- bool UpdateDestListPtr = false;
- QualType PrivatePtrType = C.getPointerType(Private->getType());
- llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
-
- switch (Action) {
- case RemoteLaneToThread: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Create a temporary to store the element in the destination
- // Reduce list.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr =
- CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
- ShuffleInElement = true;
- UpdateDestListPtr = true;
- break;
- }
- case ThreadCopy: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Get the address for dest element. The destination
- // element has already been created on the thread's stack.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr = CGF.EmitLoadOfPointer(
- DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
- break;
- }
- }
-
- // Regardless of src and dest of copy, we emit the load of src
- // element as this is required in all directions
- SrcElementAddr = SrcElementAddr.withElementType(
- CGF.ConvertTypeForMem(Private->getType()));
- DestElementAddr =
- DestElementAddr.withElementType(SrcElementAddr.getElementType());
-
- // Now that all active lanes have read the element in the
- // Reduce list, shuffle over the value from the remote lane.
- if (ShuffleInElement) {
- shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
- RemoteLaneOffset, Private->getExprLoc());
- } else {
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- SrcElementAddr, /*Volatile=*/false, Private->getType(),
- Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(
- Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getExprLoc());
- CGF.EmitStoreOfComplex(
- Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- }
-
- // Step 3.1: Modify reference in dest Reduce list as needed.
- // Modifying the reference in Reduce list to point to the newly
- // created element. The element is live in the current function
- // scope and that of functions it invokes (i.e., reduce_function).
- // RemoteReduceData[i] = (void*)&RemoteElem
- if (UpdateDestListPtr) {
- CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
- DestElementAddr.getPointer(), CGF.VoidPtrTy),
- DestElementPtrAddr, /*Volatile=*/false,
- C.VoidPtrTy);
- }
-
- ++Idx;
- }
-}
-
-/// This function emits a helper that gathers Reduce lists from the first
-/// lane of every active warp to lanes in the first warp.
-///
-/// void inter_warp_copy_func(void* reduce_data, num_warps)
-/// shared smem[warp_size];
-/// For all data entries D in reduce_data:
-/// sync
-/// If (I am the first lane in each warp)
-/// Copy my local D to smem[warp_id]
-/// sync
-/// if (I am the first warp)
-/// Copy smem[thread_id] to my local D
-static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
- ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy,
- SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
- llvm::Module &M = CGM.getModule();
-
- // ReduceList: thread local Reduce list.
- // At the stage of the computation when this function is called, partially
- // aggregated values reside in the first lane of every active warp.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // NumWarps: number of warps active in the parallel region. This could
- // be smaller than 32 (max warps in a CTA) for partial block reduction.
- ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.getIntTypeForBitwidth(32, /* Signed */ true),
- ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&NumWarpsArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
- llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_inter_warp_copy_func", &M);
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- // This array is used as a medium to transfer, one reduce element at a time,
- // the data from the first lane of every warp to lanes in the first warp
- // in order to perform the final step of a reduction in a parallel region
- // (reduction across warps). The array is placed in NVPTX __shared__ memory
- // for reduced latency, as well as to have a distinct copy for concurrently
- // executing target regions. The array is declared with common linkage so
- // as to be shared across compilation units.
- StringRef TransferMediumName =
- "__openmp_nvptx_data_transfer_temporary_storage";
- llvm::GlobalVariable *TransferMedium =
- M.getGlobalVariable(TransferMediumName);
- unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
- if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
- unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
- TransferMedium = new llvm::GlobalVariable(
- M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
- llvm::UndefValue::get(Ty), TransferMediumName,
- /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
- CGM.addCompilerUsedGlobal(TransferMedium);
- }
-
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- // Get the CUDA thread id of the current OpenMP thread on the GPU.
- llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
- // nvptx_lane_id = nvptx_id % warpsize
- llvm::Value *LaneID = getNVPTXLaneID(CGF);
- // nvptx_warp_id = nvptx_id / warpsize
- llvm::Value *WarpID = getNVPTXWarpID(CGF);
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
-
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- //
- // Warp master copies reduce element to transfer medium in __shared__
- // memory.
- //
- unsigned RealTySize =
- C.getTypeSizeInChars(Private->getType())
- .alignTo(C.getTypeAlignInChars(Private->getType()))
- .getQuantity();
- for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
- unsigned NumIters = RealTySize / TySize;
- if (NumIters == 0)
- continue;
- QualType CType = C.getIntTypeForBitwidth(
- C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
- llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
- CharUnits Align = CharUnits::fromQuantity(TySize);
- llvm::Value *Cnt = nullptr;
- Address CntAddr = Address::invalid();
- llvm::BasicBlock *PrecondBB = nullptr;
- llvm::BasicBlock *ExitBB = nullptr;
- if (NumIters > 1) {
- CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
- CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
- /*Volatile=*/false, C.IntTy);
- PrecondBB = CGF.createBasicBlock("precond");
- ExitBB = CGF.createBasicBlock("exit");
- llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(PrecondBB);
- Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
- llvm::Value *Cmp =
- Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
- Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
- CGF.EmitBlock(BodyBB);
- }
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
-
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- Address ElemPtr(ElemPtrPtr, CopyType, Align);
- if (NumIters > 1)
- ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
-
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferM...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/80343
More information about the Mlir-commits
mailing list