[llvm-branch-commits] [clang] [llvm] [mlir] [offload] Fix teams/threads limits in record replay (PR #200639)
Kevin Sala Penades via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jun 18 23:53:48 PDT 2026
https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/200639
>From 24d6172f0c00d62287c211c9f0b84e4b6f6063d9 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 18 Jun 2026 22:41:45 -0700
Subject: [PATCH 01/10] [MC] Remove unused FileNo parameter from
MCStreamer::checkCVLocSection. NFC (#204729)
---
llvm/include/llvm/MC/MCStreamer.h | 2 +-
llvm/lib/MC/MCAsmStreamer.cpp | 2 +-
llvm/lib/MC/MCObjectStreamer.cpp | 2 +-
llvm/lib/MC/MCStreamer.cpp | 3 +--
4 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 955a48705f15c..3b25cc5440c52 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -294,7 +294,7 @@ class LLVM_ABI MCStreamer {
virtual void emitRawTextImpl(StringRef String);
/// Returns true if the .cv_loc directive is in the right section.
- bool checkCVLocSection(unsigned FuncId, unsigned FileNo, SMLoc Loc);
+ bool checkCVLocSection(unsigned FuncId, SMLoc Loc);
std::unique_ptr<MCLFIRewriter> LFIRewriter;
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 68929ddc135f3..0344d1c0ead03 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -1878,7 +1878,7 @@ void MCAsmStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc) {
// Validate the directive.
- if (!checkCVLocSection(FunctionId, FileNo, Loc))
+ if (!checkCVLocSection(FunctionId, Loc))
return;
OS << "\t.cv_loc\t" << FunctionId << " " << FileNo << " " << Line << " "
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 2bf5f05c1c315..81410b37069fb 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -615,7 +615,7 @@ void MCObjectStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc) {
// Validate the directive.
- if (!checkCVLocSection(FunctionId, FileNo, Loc))
+ if (!checkCVLocSection(FunctionId, Loc))
return;
// Emit a label at the current position and record it in the CodeViewContext.
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index d6542ef24a9e4..1f0d915fae8ab 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -300,8 +300,7 @@ void MCStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc) {}
-bool MCStreamer::checkCVLocSection(unsigned FuncId, unsigned FileNo,
- SMLoc Loc) {
+bool MCStreamer::checkCVLocSection(unsigned FuncId, SMLoc Loc) {
CodeViewContext &CVC = getContext().getCVContext();
MCCVFunctionInfo *FI = CVC.getCVFunctionInfo(FuncId);
if (!FI) {
>From ed287c064db043a94b5470ebf36524153aeaf33f Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas at microsoft.com>
Date: Thu, 18 Jun 2026 22:52:28 -0700
Subject: [PATCH 02/10] [HLSL][NFC] Add codegen test for ConstantBuffer<T>
element access and initialization (#204507)
Adding tests coverage for `ConstantBuffer<T>` element access and
initialization.
Related to llvm/wg-hlsl#302 and llvm/llvm-project#195153
---
.../resources/ConstantBufferT.hlsl | 142 ++++++++++++++++++
1 file changed, 142 insertions(+)
create mode 100644 clang/test/CodeGenHLSL/resources/ConstantBufferT.hlsl
diff --git a/clang/test/CodeGenHLSL/resources/ConstantBufferT.hlsl b/clang/test/CodeGenHLSL/resources/ConstantBufferT.hlsl
new file mode 100644
index 0000000000000..30c2c9bb5c2f6
--- /dev/null
+++ b/clang/test/CodeGenHLSL/resources/ConstantBufferT.hlsl
@@ -0,0 +1,142 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | \
+// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DCONST_ADDR_SPACE=2 -DPADDING_TYPE="dx.Padding"
+// RUN: %clang_cc1 -triple spirv-vulkan-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | \
+// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-SPV -DCONST_ADDR_SPACE=12 -DPADDING_TYPE="spirv.Padding"
+
+struct S {
+ float3 f3;
+ int a;
+};
+
+struct MyConstants {
+ float f;
+ int2 i2;
+ half3 h3;
+ double d;
+ int array[2];
+ float2x2 m;
+ S s;
+};
+
+ConstantBuffer<MyConstants> CB;
+ConstantBuffer<S> CBArray[2];
+
+// CHECK-DXIL: %"class.hlsl::ConstantBuffer" = type { target("dx.CBuffer", %MyConstants) }
+// CHECK-SPV: %"class.hlsl::ConstantBuffer" = type { target("spirv.VulkanBuffer", %MyConstants, 2, 0) }
+
+// CHECK: %MyConstants = type <{ float, <2 x i32>, target("[[PADDING_TYPE]]", 4), <3 x float>,
+// CHECK-SAME: target("[[PADDING_TYPE]]", 4), double, target("[[PADDING_TYPE]]", 8),
+// CHECK-SAME: <{ [1 x <{ i32, target("[[PADDING_TYPE]]", 12) }>], i32 }>,
+// CHECK-SAME: target("[[PADDING_TYPE]]", 12), <{ [1 x <{ <2 x float>, target("[[PADDING_TYPE]]", 8) }>],
+// CHECK-SAME: <2 x float> }>, target("[[PADDING_TYPE]]", 8), %S }>
+
+// CHECK: %S = type <{ <3 x float>, i32 }>
+// CHECK: %struct.S = type { <3 x float>, i32 }
+
+// CHECK: @CB = internal global %"class.hlsl::ConstantBuffer" poison, align {{(4|8)}}
+// CHECK: [[CBStr:.*]] = private unnamed_addr constant [3 x i8] c"CB\00", align 1
+// CHECK: [[CBArrayStr:.*]] = private unnamed_addr constant [8 x i8] c"CBArray\00", align 1
+
+// CB initialization
+//
+// CHECK-LABEL: __cxx_global_var_init
+// CHECK: call void @hlsl::ConstantBuffer<MyConstants>::__createFromImplicitBinding({{[^)]+}})
+// CHECK-SAME: (ptr dead_on_unwind writable sret(%"class.hlsl::ConstantBuffer") align {{(4|8)}} @CB,
+// CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 1, i32 noundef 0, ptr noundef [[CBStr]])
+
+// CHECK: define linkonce_odr hidden void @hlsl::ConstantBuffer<MyConstants>::__createFromImplicitBinding(
+// CHECK-DXIL: call target("dx.CBuffer", %MyConstants) @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_s_MyConstantsst
+// CHECK-SPV: call target("spirv.VulkanBuffer", %MyConstants, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_s_MyConstantss_2_0t(
+
+// CHECK-LABEL: TestElementAccess
+void TestElementAccess() {
+// CHECK: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_F_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 0
+// CHECK-NEXT: [[CB_F:%.*]] = load float, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_F_PTR]], align 4
+// CHECK-NEXT: store float [[CB_F]], ptr %f, align 4
+ float f = CB.f;
+
+// CHECK-NEXT: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_I2_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 1
+// CHECK-NEXT: [[CB_I2:%.*]] = load <2 x i32>, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_I2_PTR]], align 4
+// CHECK-NEXT: store <2 x i32> [[CB_I2]], ptr %i2, align 4
+ int2 i2 = CB.i2;
+
+// CHECK-NEXT: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_H3_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 3
+// CHECK-NEXT: [[CB_H3:%.*]] = load <3 x float>, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_H3_PTR]], align 4
+// CHECK-NEXT: store <3 x float> [[CB_H3]], ptr %h3, align 4
+ half3 h3 = CB.h3;
+
+// CHECK-NEXT: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_D_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 5
+// CHECK-NEXT: [[CB_D:%.*]] = load double, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_D_PTR]], align 8
+// CHECK-NEXT: store double [[CB_D]], ptr %d, align 8
+ double d = CB.d;
+
+// CHECK-NEXT: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_ARRAY_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 7
+// CHECK-NEXT: [[CB_ARRAY_DECAY_PTR:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_ARRAY_PTR]], {{(i32|i64)}} 0, {{(i32|i64)}} 0
+// CHECK-NEXT: [[CB_ARRAY_1_PTR:%.*]] = getelementptr <{ i32, target("[[PADDING_TYPE]]", 12) }>, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_ARRAY_DECAY_PTR]], {{(i32|i64)}} 1, {{(i32|i64)}} 0
+// CHECK-NEXT: [[CB_ARRAY_1:%.*]] = load i32, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_ARRAY_1_PTR]], align 16
+// CHECK-NEXT: store i32 [[CB_ARRAY_1]], ptr %arrayEl, align 4
+ int arrayEl = CB.array[1];
+
+// CHECK-NEXT: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_M_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 9
+// CHECK-NEXT: [[CB_M:%.*]] = load <4 x float>, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_M_PTR]], align 4
+// CHECK-NEXT: store <4 x float> [[CB_M]], ptr %m, align 4
+ float2x2 m = CB.m;
+
+// CHECK-NEXT: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_S_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 11
+// CHECK-NEXT: [[CB_S_F3_PTR:%.*]] = getelementptr inbounds %S, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_S_PTR]], i32 0, i32 0
+// CHECK-NEXT: [[S_F3_PTR:%.*]] = getelementptr inbounds %struct.S, ptr %s, i32 0, i32 0
+// CHECK-NEXT: [[CB_S_F3:%.*]] = load <3 x float>, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_S_F3_PTR]], align 4
+// CHECK-NEXT: store <3 x float> [[CB_S_F3]], ptr [[S_F3_PTR]], align 4
+// CHECK-NEXT: [[CB_S_A_PTR:%.*]] = getelementptr inbounds %S, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_S_PTR]], i32 0, i32 1
+// CHECK-NEXT: [[S_A_PTR:%.*]] = getelementptr inbounds %struct.S, ptr %s, i32 0, i32 1
+// CHECK-NEXT: [[CB_S_A:%.*]] = load i32, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_S_A_PTR]], align 4
+// CHECK-NEXT: store i32 [[CB_S_A]], ptr [[S_A_PTR]], align 4
+ S s = CB.s;
+
+// CHECK-NEXT: [[CB_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}} @CB)
+// CHECK-NEXT: [[CB_S_PTR:%.*]] = getelementptr inbounds nuw %MyConstants, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_PTR]], i32 0, i32 11
+// CHECK-NEXT: [[CB_S_F3_PTR:%.*]] = getelementptr inbounds nuw %S, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_S_PTR]], i32 0, i32 0
+// CHECK-NEXT: [[CB_S_F:%.*]] = load <3 x float>, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_S_F3_PTR]], align 4
+// CHECK-NEXT: store <3 x float> [[CB_S_F]], ptr %f3, align 4
+ float3 f3 = CB.s.f3;
+}
+
+// CHECK: define {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<MyConstants>::operator MyConstants const AS[[CONST_ADDR_SPACE]]&() const(ptr {{.*}})
+// CHECK: [[HANDLE_PTR:%.*]] = getelementptr inbounds nuw %"class.hlsl::ConstantBuffer", ptr %{{.*}}, i32 0, i32 0
+// CHECK-DXIL: [[HANDLE:%.*]] = load target("dx.CBuffer", %MyConstants), ptr [[HANDLE_PTR]], align 4
+// CHECK-DXIL: [[BASE_PTR:%.*]] = call ptr addrspace([[CONST_ADDR_SPACE]]) @llvm.dx.resource.getbasepointer.p2.tdx.CBuffer_s_MyConstantsst(target("dx.CBuffer", %MyConstants) [[HANDLE]])
+// CHECK-SPV: [[HANDLE:%.*]] = load target("spirv.VulkanBuffer", %MyConstants, 2, 0), ptr [[HANDLE_PTR]], align 8
+// CHECK-SPV: [[BASE_PTR:%.*]] = call ptr addrspace([[CONST_ADDR_SPACE]]) @llvm.spv.resource.getbasepointer.p12.tspirv.VulkanBuffer_s_MyConstantss_2_0t(target("spirv.VulkanBuffer", %MyConstants, 2, 0) [[HANDLE]])
+// CHECK: ret ptr addrspace([[CONST_ADDR_SPACE]]) [[BASE_PTR]]
+
+// CHECK-LABEL: TestArrayAccess
+void TestArrayAccess() {
+// CHECK: [[TMP0:%.*]] = alloca %"class.hlsl::ConstantBuffer.0", align {{(4|8)}}
+// CHECK: [[TMP1:%.*]] = alloca %"class.hlsl::ConstantBuffer.0", align {{(4|8)}}
+
+// CHECK: call void @hlsl::ConstantBuffer<S>::__createFromImplicitBinding({{.*}})(ptr {{.*}} sret(%"class.hlsl::ConstantBuffer.0") align {{(4|8)}} [[TMP0]],
+// CHECK-SAME: i32 noundef 1, i32 noundef 0, i32 noundef 2, i32 noundef 1, ptr noundef [[CBArrayStr]])
+// CHECK-NEXT: [[CB_1_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<S>::operator S const AS[[CONST_ADDR_SPACE]]&() const(ptr{{.*}} [[TMP0]])
+// CHECK-NEXT: [[CB_1_F3_PTR:%.*]] = getelementptr inbounds nuw %S, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_1_PTR]], i32 0, i32 0
+// CHECK-NEXT: [[CB_1_F3:%.*]] = load <3 x float>, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_1_F3_PTR]], align 4
+// CHECK-NEXT: store <3 x float> [[CB_1_F3]], ptr %f3, align 4
+ float3 f3 = CBArray[1].f3;
+
+// CHECK: call void @hlsl::ConstantBuffer<S>::__createFromImplicitBinding({{.*}})(ptr {{.*}} sret(%"class.hlsl::ConstantBuffer.0") align {{(4|8)}} [[TMP1]],
+// CHECK-SAME: i32 noundef 1, i32 noundef 0, i32 noundef 2, i32 noundef 0, ptr noundef [[CBArrayStr]])
+// CHECK-NEXT: [[CB_0_PTR:%.*]] = call {{.*}} ptr addrspace([[CONST_ADDR_SPACE]]) @hlsl::ConstantBuffer<S>::operator S const AS[[CONST_ADDR_SPACE]]&() const(ptr{{.*}} [[TMP1]])
+// CHECK-NEXT: [[CB_0_A_PTR:%.*]] = getelementptr inbounds nuw %S, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_0_PTR]], i32 0, i32 1
+// CHECK-NEXT: [[CB_0_A:%.*]] = load i32, ptr addrspace([[CONST_ADDR_SPACE]]) [[CB_0_A_PTR]], align 4
+// CHECK-NEXT: store i32 [[CB_0_A]], ptr %a, align 4
+ int a = CBArray[0].a;
+}
+
+// CHECK-DXIL: declare ptr addrspace([[CONST_ADDR_SPACE]]) @llvm.dx.resource.getbasepointer.p2.tdx.CBuffer_s_MyConstantsst(target("dx.CBuffer", %MyConstants))
+// CHECK-SPV: declare ptr addrspace([[CONST_ADDR_SPACE]]) @llvm.spv.resource.getbasepointer.p12.tspirv.VulkanBuffer_s_MyConstantss_2_0t(target("spirv.VulkanBuffer", %MyConstants, 2, 0))
>From 21622397c16f65d00361cd883c769189b06df13c Mon Sep 17 00:00:00 2001
From: Sairudra More <sairudra60 at gmail.com>
Date: Fri, 19 Jun 2026 11:45:05 +0530
Subject: [PATCH 03/10] [mlir][OpenMP] Translate explicit task in_reduction
(#202611)
Lower `in_reduction` on explicit `omp.task` operations to LLVM IR.
Inside the outlined task body, obtain the executing thread's `gtid` and call `__kmpc_task_reduction_get_th_data` with a null descriptor and the original reduction variable address. This lets the runtime find the enclosing taskgroup reduction registration and return the per-task private reduction storage.
The `in_reduction` block arguments are remapped to the returned private storage, so updates inside the explicit task body target the task-private reduction copy instead of the original shared variable.
Unsupported cases remain guarded, including byref `in_reduction` and richer `declare_reduction` forms such as two-argument initializers, cleanup regions, and missing combiners.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 68 ++++-
.../LLVMIR/openmp-task-in-reduction.mlir | 233 ++++++++++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 6 +-
3 files changed, 303 insertions(+), 4 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6ac01ac3f301e..703f72d1ab5bc 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -470,7 +470,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
})
.Case([&](omp::TaskOp op) {
checkAllocate(op, result);
- checkInReduction(op, result);
+ checkInReductionByref(op, result);
})
.Case([&](omp::TaskgroupOp op) {
checkAllocate(op, result);
@@ -3106,6 +3106,22 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
if (failed(buildAffinityData(taskOp, builder, moduleTranslation, ad)))
return llvm::failure();
+ // Resolve and validate in_reduction declarations. Byref in_reduction has
+ // already been rejected by checkImplementationStatus; the helper rejects the
+ // remaining richer declare_reduction shapes (two-argument initializer,
+ // cleanup region, missing combiner). This is pure MLIR symbol-table work and
+ // emits no IR. The matching task_reduction descriptor is registered by an
+ // enclosing taskgroup; here we only look the per-task storage up at runtime.
+ SmallVector<omp::DeclareReductionOp> inRedDecls;
+ if (failed(collectAndValidateTaskloopRedDecls(
+ taskOp.getOperation(), taskOp.getInReductionSyms(), "omp.task",
+ "in_reduction", inRedDecls)))
+ return failure();
+ SmallVector<llvm::Value *> inRedOrigPtrs;
+ inRedOrigPtrs.reserve(inRedDecls.size());
+ for (Value v : taskOp.getInReductionVars())
+ inRedOrigPtrs.push_back(moduleTranslation.lookupValue(v));
+
// Set up for call to createTask()
builder.SetInsertPoint(taskStartBlock);
@@ -3175,6 +3191,56 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
moduleTranslation.mapValue(blockArg, llvmPrivateVar);
}
+ // Map in_reduction block arguments to the per-task private storage returned
+ // by __kmpc_task_reduction_get_th_data. This call must be emitted inside
+ // the to-be-outlined task body so that it returns the *executing* thread's
+ // gtid (not the encountering thread's). The descriptor is NULL: the runtime
+ // walks up enclosing taskgroups to find the matching task_reduction
+ // registration for `origPtr`. The original pointers are auto-captured into
+ // the task shareds aggregate by CodeExtractor during
+ // OpenMPIRBuilder::finalize.
+ if (!inRedDecls.empty()) {
+ auto iface = cast<omp::BlockArgOpenMPOpInterface>(taskOp.getOperation());
+ llvm::OpenMPIRBuilder &ompB = *moduleTranslation.getOpenMPBuilder();
+ llvm::Module *m = moduleTranslation.getLLVMModule();
+ llvm::LLVMContext &llvmCtx = m->getContext();
+ llvm::OpenMPIRBuilder::LocationDescription bodyLoc(builder);
+ uint32_t srcLocSize;
+ llvm::Constant *srcLocStr =
+ ompB.getOrCreateSrcLocStr(bodyLoc, srcLocSize);
+ llvm::Value *bodyIdent = ompB.getOrCreateIdent(srcLocStr, srcLocSize);
+ // Align OpenMPIRBuilder's internal IRBuilder with `builder` so the gtid
+ // call lands inside the to-be-outlined task body.
+ ompB.updateToLocation(bodyLoc);
+ llvm::Value *bodyGtid = ompB.getOrCreateThreadID(bodyIdent);
+ llvm::FunctionCallee getThData = ompB.getOrCreateRuntimeFunction(
+ *m, llvm::omp::OMPRTL___kmpc_task_reduction_get_th_data);
+ llvm::Type *ptrTy = llvm::PointerType::getUnqual(llvmCtx);
+ llvm::Value *nullDesc = llvm::ConstantPointerNull::get(ptrTy);
+ ArrayRef<BlockArgument> inRedBlockArgs = iface.getInReductionBlockArgs();
+ for (auto [blockArg, origPtr] :
+ llvm::zip_equal(inRedBlockArgs, inRedOrigPtrs)) {
+ // __kmpc_task_reduction_get_th_data takes and returns a generic,
+ // default-address-space `ptr`. Normalize a non-default-address-space
+ // original pointer to the generic address space before the call, and
+ // cast the returned private pointer back to the block argument's
+ // address space when it differs (mirrors the taskloop reduction
+ // remapping in convertOmpTaskloopContextOp).
+ llvm::Value *lookupPtr = origPtr;
+ if (auto *origPtrTy =
+ llvm::dyn_cast<llvm::PointerType>(lookupPtr->getType());
+ origPtrTy && origPtrTy->getAddressSpace() != 0)
+ lookupPtr = builder.CreateAddrSpaceCast(lookupPtr, ptrTy);
+ llvm::Value *priv = builder.CreateCall(
+ getThData, {bodyGtid, nullDesc, lookupPtr}, "omp.inred.priv");
+ if (auto *argPtrTy = llvm::dyn_cast<llvm::PointerType>(
+ moduleTranslation.convertType(blockArg.getType()));
+ argPtrTy && argPtrTy->getAddressSpace() != 0)
+ priv = builder.CreateAddrSpaceCast(priv, argPtrTy);
+ moduleTranslation.mapValue(blockArg, priv);
+ }
+ }
+
auto continuationBlockOrError = convertOmpOpRegions(
taskOp.getRegion(), "omp.task.region", builder, moduleTranslation);
if (failed(handleError(continuationBlockOrError, *taskOp)))
diff --git a/mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir
new file mode 100644
index 0000000000000..5735898d47829
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir
@@ -0,0 +1,233 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// in_reduction on an explicit omp.task. Unlike taskgroup task_reduction, the
+// task does not register a reduction; it participates in a reduction declared
+// by an enclosing taskgroup. The lowering must, inside the outlined task body:
+// 1. Obtain the executing thread's gtid via __kmpc_global_thread_num;
+// 2. Look up the per-task private storage via
+// __kmpc_task_reduction_get_th_data(gtid, null, orig) -- the NULL
+// descriptor makes the runtime walk up enclosing taskgroups to find the
+// matching task_reduction registration for `orig`;
+// 3. Use the returned private pointer for all updates in the task body, never
+// the original shared variable.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @task_in_reduction_single(%x : !llvm.ptr) {
+ omp.task in_reduction(@add_i32 %x -> %prv : !llvm.ptr) {
+ %v = llvm.load %prv : !llvm.ptr -> i32
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %s = llvm.add %v, %c1 : i32
+ llvm.store %s, %prv : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+}
+
+// The encountering function must NOT register a reduction: no taskgroup, no
+// descriptor array, and no __kmpc_taskred_init for in_reduction.
+// CHECK-LABEL: define void @task_in_reduction_single(
+// CHECK-NOT: @__kmpc_taskred_init
+// CHECK-NOT: @__kmpc_taskgroup
+
+// Outlined task body looks up per-task storage via the runtime with a NULL
+// descriptor, and updates that private storage (not the original pointer).
+// CHECK-LABEL: define internal void @task_in_reduction_single..omp_par(
+// CHECK: %[[BODY_GEP:.+]] = getelementptr {{.+}}, i32 0, i32 0
+// CHECK: %[[BODY_ORIG:.+]] = load ptr, ptr %[[BODY_GEP]]
+// CHECK: %[[BODY_GTID:.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: %[[PRIV:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[BODY_GTID]], ptr null, ptr %[[BODY_ORIG]])
+// CHECK: %[[LD:.+]] = load i32, ptr %[[PRIV]]
+// CHECK: %[[ADD:.+]] = add i32 %[[LD]], 1
+// CHECK: store i32 %[[ADD]], ptr %[[PRIV]]
+
+// -----
+
+// Multiple in_reduction items: the body issues one
+// __kmpc_task_reduction_get_th_data per item, each with a NULL descriptor.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @task_in_reduction_multi(%x : !llvm.ptr, %y : !llvm.ptr) {
+ omp.task in_reduction(@add_i32 %x -> %px, @add_i32 %y -> %py : !llvm.ptr, !llvm.ptr) {
+ %vx = llvm.load %px : !llvm.ptr -> i32
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %sx = llvm.add %vx, %c1 : i32
+ llvm.store %sx, %px : i32, !llvm.ptr
+ %vy = llvm.load %py : !llvm.ptr -> i32
+ %c2 = llvm.mlir.constant(2 : i32) : i32
+ %sy = llvm.add %vy, %c2 : i32
+ llvm.store %sy, %py : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+}
+
+// Each item is threaded through independently: the two original pointers come
+// from distinct slots of the task shareds aggregate, each is passed to its own
+// __kmpc_task_reduction_get_th_data lookup (NULL descriptor), and each item's
+// body load/store targets only the matching private pointer -- never the
+// original shared pointer.
+// CHECK-LABEL: define internal void @task_in_reduction_multi..omp_par(
+// CHECK: %[[GEP0:.+]] = getelementptr {{.+}}, i32 0, i32 0
+// CHECK: %[[ORIG0:.+]] = load ptr, ptr %[[GEP0]]
+// CHECK: %[[GEP1:.+]] = getelementptr {{.+}}, i32 0, i32 1
+// CHECK: %[[ORIG1:.+]] = load ptr, ptr %[[GEP1]]
+// CHECK: %[[PRIV0:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %{{.+}}, ptr null, ptr %[[ORIG0]])
+// CHECK: %[[PRIV1:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %{{.+}}, ptr null, ptr %[[ORIG1]])
+// CHECK: %[[LDX:.+]] = load i32, ptr %[[PRIV0]]
+// CHECK: %[[ADDX:.+]] = add i32 %[[LDX]], 1
+// CHECK: store i32 %[[ADDX]], ptr %[[PRIV0]]
+// CHECK: %[[LDY:.+]] = load i32, ptr %[[PRIV1]]
+// CHECK: %[[ADDY:.+]] = add i32 %[[LDY]], 2
+// CHECK: store i32 %[[ADDY]], ptr %[[PRIV1]]
+// CHECK-NOT: store i32 %{{.+}}, ptr %[[ORIG0]]
+// CHECK-NOT: store i32 %{{.+}}, ptr %[[ORIG1]]
+
+// -----
+
+// Regression: a plain omp.task with no in_reduction must not emit any
+// __kmpc_task_reduction_get_th_data call.
+
+llvm.func @task_plain(%x : !llvm.ptr) {
+ omp.task {
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ llvm.store %c1, %x : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @task_plain(
+// CHECK-NOT: @__kmpc_task_reduction_get_th_data
+
+// -----
+
+// Nested case: an explicit omp.task carrying in_reduction inside an
+// omp.taskgroup that declares the matching task_reduction. Registration happens
+// once, on the enclosing taskgroup (__kmpc_taskred_init over a
+// kmp_taskred_input_t descriptor); the explicit task does not register its own
+// reduction. Inside the outlined task body the item is resolved with
+// __kmpc_task_reduction_get_th_data and a NULL descriptor, which makes the
+// runtime walk the enclosing taskgroup chain to find the registration.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @task_in_reduction_nested(%x : !llvm.ptr) {
+ omp.taskgroup task_reduction(@add_i32 %x -> %tgprv : !llvm.ptr) {
+ omp.task in_reduction(@add_i32 %x -> %prv : !llvm.ptr) {
+ %v = llvm.load %prv : !llvm.ptr -> i32
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %s = llvm.add %v, %c1 : i32
+ llvm.store %s, %prv : i32, !llvm.ptr
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// The enclosing taskgroup registers the reduction and then spawns the explicit
+// task. The end_taskgroup block is emitted textually before the task-spawn
+// block, so it is matched first here.
+// CHECK-LABEL: define void @task_in_reduction_nested(
+// CHECK: %[[ARR:.+]] = alloca [1 x %kmp_taskred_input_t]
+// CHECK: call void @__kmpc_taskgroup(
+// CHECK: call ptr @__kmpc_taskred_init(i32 %{{.+}}, i32 1, ptr %[[ARR]])
+// CHECK: call void @__kmpc_end_taskgroup(
+// CHECK: call ptr @__kmpc_omp_task_alloc({{.+}}@task_in_reduction_nested..omp_par)
+// CHECK: call i32 @__kmpc_omp_task(
+
+// The outlined task body resolves the in_reduction item with a NULL descriptor
+// and updates only the private storage. It never registers its own reduction
+// and never writes back to the original shared pointer.
+// CHECK-LABEL: define internal void @task_in_reduction_nested..omp_par(
+// CHECK: %[[BODY_GEP:.+]] = getelementptr {{.+}}, i32 0, i32 0
+// CHECK: %[[BODY_ORIG:.+]] = load ptr, ptr %[[BODY_GEP]]
+// CHECK: %[[BODY_GTID:.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: %[[PRIV:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[BODY_GTID]], ptr null, ptr %[[BODY_ORIG]])
+// CHECK: %[[LD:.+]] = load i32, ptr %[[PRIV]]
+// CHECK: %[[ADD:.+]] = add i32 %[[LD]], 1
+// CHECK: store i32 %[[ADD]], ptr %[[PRIV]]
+// CHECK-NOT: store i32 %{{.+}}, ptr %[[BODY_ORIG]]
+// CHECK-NOT: call ptr @__kmpc_taskred_init
+
+// -----
+
+// Non-default address space: the in_reduction storage pointer lives in
+// addrspace(1). __kmpc_task_reduction_get_th_data takes and returns a generic,
+// default-addrspace ptr, so the original is addrspacecast to the generic space
+// before the lookup, and the returned private pointer is cast back to
+// addrspace(1) before the body uses it.
+
+omp.declare_reduction @add_i32_as1 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @task_in_reduction_as1(%x : !llvm.ptr<1>) {
+ omp.task in_reduction(@add_i32_as1 %x -> %prv : !llvm.ptr<1>) {
+ %v = llvm.load %prv : !llvm.ptr<1> -> i32
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %s = llvm.add %v, %c1 : i32
+ llvm.store %s, %prv : i32, !llvm.ptr<1>
+ omp.terminator
+ }
+ llvm.return
+}
+
+// The encountering function still registers nothing for in_reduction.
+// CHECK-LABEL: define void @task_in_reduction_as1(
+// CHECK-NOT: @__kmpc_taskred_init
+// CHECK-NOT: @__kmpc_taskgroup
+
+// The outlined body normalizes the addrspace(1) original to a generic ptr for
+// the runtime lookup, casts the returned private back to addrspace(1), and uses
+// that private pointer for the body load/store.
+// CHECK-LABEL: define internal void @task_in_reduction_as1..omp_par(
+// CHECK: %[[GEP:.+]] = getelementptr {{.+}}, i32 0, i32 0
+// CHECK: %[[ORIG:.+]] = load ptr addrspace(1), ptr %[[GEP]]
+// CHECK: %[[GTID:.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: %[[ORIG_CAST:.+]] = addrspacecast ptr addrspace(1) %[[ORIG]] to ptr
+// CHECK: %[[PRIV:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[GTID]], ptr null, ptr %[[ORIG_CAST]])
+// CHECK: %[[PRIV_CAST:.+]] = addrspacecast ptr %[[PRIV]] to ptr addrspace(1)
+// CHECK: %[[LD:.+]] = load i32, ptr addrspace(1) %[[PRIV_CAST]]
+// CHECK: %[[ADD:.+]] = add i32 %[[LD]], 1
+// CHECK: store i32 %[[ADD]], ptr addrspace(1) %[[PRIV_CAST]]
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index eb32f7f66273a..377a5bb799be4 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -262,10 +262,10 @@ atomic {
llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
omp.yield
}
-llvm.func @task_in_reduction(%x : !llvm.ptr) {
- // expected-error at below {{not yet implemented: Unhandled clause in_reduction in omp.task operation}}
+llvm.func @task_in_reduction_byref(%x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause in_reduction with byref modifier in omp.task operation}}
// expected-error at below {{LLVM Translation failed for operation: omp.task}}
- omp.task in_reduction(@add_f32 %x -> %prv : !llvm.ptr) {
+ omp.task in_reduction(byref @add_f32 %x -> %prv : !llvm.ptr) {
omp.terminator
}
llvm.return
>From 1e0899794dd432c6c3c7bbba6940e8640e6a201c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 19 Jun 2026 07:24:21 +0100
Subject: [PATCH 04/10] [DAG] visitEXTRACT_SUBVECTOR - Fold
EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(X,C1),C0) with nonzero indices (#204533)
Removed equivalent fold from x86 and added generic DAG fold to replace
it - net zero test changes
Refactored version of #200935
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 ++++++++--------
llvm/lib/Target/X86/X86ISelLowering.cpp | 8 --------
2 files changed, 8 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5a4ae64cb98af..1409c7b683069 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27656,18 +27656,18 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
return NarrowLoad;
// Combine an extract of an extract into a single extract_subvector.
- // ext (ext X, C), 0 --> ext X, C
- if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
+ // ext (ext X, C1), C2 --> ext X, C1 + C2
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
// Both indices must have the same scaling factor and C has to be a
// multiple of the new result type's known minimum vector length.
+ uint64_t InnerExtIdx = V.getConstantOperandVal(1);
+ uint64_t NewExtIdx = InnerExtIdx + ExtIdx;
if (V.getValueType().isScalableVector() == NVT.isScalableVector() &&
- V.getConstantOperandVal(1) % NVT.getVectorMinNumElements() == 0 &&
+ NewExtIdx % NVT.getVectorMinNumElements() == 0 &&
TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
- V.getConstantOperandVal(1)) &&
- TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, V.getOperand(0),
- V.getOperand(1));
- }
+ NewExtIdx) &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT))
+ return DAG.getExtractSubvector(DL, NVT, V.getOperand(0), NewExtIdx);
}
// ty1 extract_vector(ty2 splat(V))) -> ty1 splat(V)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b9a65e2671aa9..a5470d9735dba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -62023,14 +62023,6 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
- // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
- if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
- TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
- unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
- return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
- }
-
// EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
// --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
// iff SUB is entirely contained in the extraction.
>From 4b2a02d47e31da4764aa6e204e6c502ccc69e201 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 19 Jun 2026 07:27:27 +0100
Subject: [PATCH 05/10] [X86] Replace X86 specific PDEP/PEXT handling with
generic intrinsics (#204144)
* Remove X86ISD::PDEP/PEXT and use ISD::PDEP/PEXT instead
* AutoUpgrade x86 pdep/pext intrinsics to llvm.pdep/pext generics
* Move X86 DAG knownbits/demandedbits handling to generic (unchanged)
* Move X86 InstCombine folds to generic (unchanged)
* Add memory sanitizer handling for generic pdep/pext intrinsics
* Updated clang builtins to emit generics
Fixes #204537
---
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 10 +++
clang/test/CodeGen/X86/bmi2-builtins.c | 8 +-
llvm/include/llvm/IR/IntrinsicsX86.td | 12 ---
llvm/lib/Analysis/ConstantFolding.cpp | 10 +++
llvm/lib/Analysis/InstructionSimplify.cpp | 14 +++
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 ++++
.../CodeGen/SelectionDAG/TargetLowering.cpp | 28 ++++++
llvm/lib/IR/AutoUpgrade.cpp | 8 ++
llvm/lib/Target/X86/X86ISelLowering.cpp | 50 +----------
.../Target/X86/X86InstCombineIntrinsic.cpp | 88 -------------------
llvm/lib/Target/X86/X86InstrFragments.td | 4 -
llvm/lib/Target/X86/X86InstrMisc.td | 54 ++----------
llvm/lib/Target/X86/X86IntrinsicsInfo.h | 4 -
.../InstCombine/InstCombineCalls.cpp | 34 +++++++
.../Instrumentation/MemorySanitizer.cpp | 29 +++++-
llvm/test/CodeGen/X86/bmi2.ll | 23 +++--
.../Instrumentation/MemorySanitizer/bmi.ll | 16 ++--
.../Instrumentation/MemorySanitizer/pdep.ll | 35 +++++---
.../Instrumentation/MemorySanitizer/pext.ll | 35 +++++---
llvm/test/Transforms/InstCombine/pdep.ll | 30 +++----
llvm/test/Transforms/InstCombine/pext.ll | 30 +++----
22 files changed, 259 insertions(+), 287 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index acfeb9967cd2f..50125a71fcd5f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -976,6 +976,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
}
+ case X86::BI__builtin_ia32_pdep_si:
+ case X86::BI__builtin_ia32_pdep_di: {
+ Function *F = CGM.getIntrinsic(Intrinsic::pdep, Ops[0]->getType());
+ return Builder.CreateCall(F, Ops);
+ }
+ case X86::BI__builtin_ia32_pext_si:
+ case X86::BI__builtin_ia32_pext_di: {
+ Function *F = CGM.getIntrinsic(Intrinsic::pext, Ops[0]->getType());
+ return Builder.CreateCall(F, Ops);
+ }
case X86::BI__builtin_ia32_undef128:
case X86::BI__builtin_ia32_undef256:
case X86::BI__builtin_ia32_undef512:
diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c b/clang/test/CodeGen/X86/bmi2-builtins.c
index 1b2cb9048adb2..c83cc43d9fc3f 100644
--- a/clang/test/CodeGen/X86/bmi2-builtins.c
+++ b/clang/test/CodeGen/X86/bmi2-builtins.c
@@ -17,12 +17,12 @@ unsigned int test_bzhi_u32(unsigned int __X, unsigned int __Y) {
}
unsigned int test_pdep_u32(unsigned int __X, unsigned int __Y) {
- // CHECK: @llvm.x86.bmi.pdep.32
+ // CHECK: @llvm.pdep.i32
return _pdep_u32(__X, __Y);
}
unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) {
- // CHECK: @llvm.x86.bmi.pext.32
+ // CHECK: @llvm.pext.i32
return _pext_u32(__X, __Y);
}
@@ -41,12 +41,12 @@ unsigned long long test_bzhi_u64(unsigned long long __X, unsigned long long __Y)
}
unsigned long long test_pdep_u64(unsigned long long __X, unsigned long long __Y) {
- // CHECK: @llvm.x86.bmi.pdep.64
+ // CHECK: @llvm.pdep.i64
return _pdep_u64(__X, __Y);
}
unsigned long long test_pext_u64(unsigned long long __X, unsigned long long __Y) {
- // CHECK: @llvm.x86.bmi.pext.64
+ // CHECK: @llvm.pext.i64
return _pext_u64(__X, __Y);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index b75a0485d6263..5c7785731111c 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2575,18 +2575,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_bmi_bzhi_64 : ClangBuiltin<"__builtin_ia32_bzhi_di">,
DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
- def int_x86_bmi_pdep_32 : ClangBuiltin<"__builtin_ia32_pdep_si">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem]>;
- def int_x86_bmi_pdep_64 : ClangBuiltin<"__builtin_ia32_pdep_di">,
- DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
- [IntrNoMem]>;
- def int_x86_bmi_pext_32 : ClangBuiltin<"__builtin_ia32_pext_si">,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem]>;
- def int_x86_bmi_pext_64 : ClangBuiltin<"__builtin_ia32_pext_di">,
- DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
- [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 3fe78d6c4322d..f18b7a0b66a21 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1756,6 +1756,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::fshl:
case Intrinsic::fshr:
case Intrinsic::clmul:
+ case Intrinsic::pdep:
+ case Intrinsic::pext:
case Intrinsic::launder_invariant_group:
case Intrinsic::strip_invariant_group:
case Intrinsic::masked_load:
@@ -3904,6 +3906,14 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
if (!C0 || !C1)
return Constant::getNullValue(Ty);
return ConstantInt::get(Ty, APIntOps::clmul(*C0, *C1));
+ case Intrinsic::pdep:
+ if (!C0 || !C1)
+ return Constant::getNullValue(Ty);
+ return ConstantInt::get(Ty, APIntOps::expandBits(*C0, *C1));
+ case Intrinsic::pext:
+ if (!C0 || !C1)
+ return Constant::getNullValue(Ty);
+ return ConstantInt::get(Ty, APIntOps::compressBits(*C0, *C1));
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_max:
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 7698d0d772a94..3b20592bcaed2 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6930,6 +6930,20 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
return Constant::getNullValue(ReturnType);
break;
}
+ case Intrinsic::pdep: {
+ if (match(Op1, m_Zero()))
+ return Constant::getNullValue(ReturnType);
+ if (match(Op1, m_AllOnes()))
+ return Op0;
+ break;
+ }
+ case Intrinsic::pext: {
+ if (match(Op1, m_Zero()))
+ return Constant::getNullValue(ReturnType);
+ if (match(Op1, m_AllOnes()))
+ return Op0;
+ break;
+ }
case Intrinsic::ptrmask: {
// NOTE: We can't apply this simplifications based on the value of Op1
// because we need to preserve provenance.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1409c7b683069..4fdef7d4afb5d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12245,12 +12245,18 @@ SDValue DAGCombiner::visitPDEP(SDNode *N) {
// pdep(x, 0) -> 0
if (isNullOrNullSplat(N1))
return DAG.getConstant(0, DL, VT);
+
// pdep(x, -1) -> x (all positions selected, bits deposited at identity)
if (isAllOnesOrAllOnesSplat(N1))
return N0;
+
// fold pdep(c1, c2) -> expandBits(c1, c2)
if (SDValue C = DAG.FoldConstantArithmetic(ISD::PDEP, DL, VT, {N0, N1}))
return C;
+
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b32c16fe4300f..44120cceed2a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3952,6 +3952,24 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known.Zero.setBitsFrom(1);
break;
}
+ case ISD::PDEP: {
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // Zeros are retained from the mask operand. But not ones.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ break;
+ }
+ case ISD::PEXT: {
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ // The result has as many leading zeros as the number of zeroes in the mask.
+ unsigned Count = Known.Zero.popcount();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
+ Known.One.clearAllBits();
+ break;
+ }
case ISD::CLMUL: {
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5ba36495ba4f6..5772ef37ec762 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2463,6 +2463,34 @@ bool TargetLowering::SimplifyDemandedBits(
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
break;
}
+ case ISD::PDEP: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
+ APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+ // If the demanded bits has leading zeroes, we don't demand those from the
+ // mask.
+ if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
+ return true;
+
+ // The number of possible 1s in the mask determines the number of LSBs of
+ // operand 0 used. Undemanded bits from the mask don't matter so filter
+ // them before counting.
+ KnownBits Known2;
+ uint64_t Count = (~Known.Zero & LoMask).popcount();
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+ if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+ return true;
+
+ // Zeroes are retained from the mask, but not ones.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ break;
+ }
case ISD::SIGN_EXTEND_INREG: {
SDValue Op0 = Op.getOperand(0);
EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 4d353c95b8930..3a823f906b012 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -533,6 +533,10 @@ static bool shouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.starts_with("vpcom") || // Added in 3.2, Updated in 9.0
Name.starts_with("vprot")); // Added in 8.0
+ if (Name.consume_front("bmi."))
+ return (Name.starts_with("pdep.") || // Added in 23.0
+ Name.starts_with("pext.")); // Added in 23.0
+
return (Name == "addcarry.u32" || // Added in 8.0
Name == "addcarry.u64" || // Added in 8.0
Name == "addcarryx.u32" || // Added in 8.0
@@ -4618,6 +4622,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
} else if (Name.starts_with("avx512.mask.") &&
upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
// Rep will be updated by the call in the condition.
+ } else if (Name.starts_with("bmi.pdep.")) {
+ Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pdep);
+ } else if (Name.starts_with("bmi.pext.")) {
+ Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pext);
} else
reportFatalUsageErrorWithCI("Unexpected intrinsic", CI);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a5470d9735dba..e9ba1c05df361 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39748,25 +39748,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.One.clearAllBits();
break;
}
- case X86ISD::PDEP: {
- KnownBits Known2;
- Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
- Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- // Zeros are retained from the mask operand. But not ones.
- Known.One.clearAllBits();
- // The result will have at least as many trailing zeros as the non-mask
- // operand since bits can only map to the same or higher bit position.
- Known.Zero.setLowBits(Known2.countMinTrailingZeros());
- break;
- }
- case X86ISD::PEXT: {
- Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
- // The result has as many leading zeros as the number of zeroes in the mask.
- unsigned Count = Known.Zero.popcount();
- Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
- Known.One.clearAllBits();
- break;
- }
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
@@ -46015,34 +45996,6 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
break;
}
- case X86ISD::PDEP: {
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
-
- unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
- APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
-
- // If the demanded bits has leading zeroes, we don't demand those from the
- // mask.
- if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
- return true;
-
- // The number of possible 1s in the mask determines the number of LSBs of
- // operand 0 used. Undemanded bits from the mask don't matter so filter
- // them before counting.
- KnownBits Known2;
- uint64_t Count = (~Known.Zero & LoMask).popcount();
- APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
- if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
- return true;
-
- // Zeroes are retained from the mask, but not ones.
- Known.One.clearAllBits();
- // The result will have at least as many trailing zeros as the non-mask
- // operand since bits can only map to the same or higher bit position.
- Known.Zero.setLowBits(Known2.countMinTrailingZeros());
- return false;
- }
case X86ISD::VPMADD52L:
case X86ISD::VPMADD52H: {
KnownBits KnownOp0, KnownOp1, KnownOp2;
@@ -63415,8 +63368,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
case X86ISD::BEXTR:
case X86ISD::BEXTRI:
- case X86ISD::BZHI:
- case X86ISD::PDEP: return combineBMI(N, DAG, DCI);
+ case X86ISD::BZHI: return combineBMI(N, DAG, DCI);
case X86ISD::PCLMULQDQ: return combinePCLMULQDQ(N, DAG, DCI);
case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 4999581489e82..ad1c171428671 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2259,94 +2259,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
// TODO should we convert this to an AND if the RHS is constant?
}
break;
- case Intrinsic::x86_bmi_pext_32:
- case Intrinsic::x86_bmi_pext_64:
- if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- if (MaskC->isNullValue()) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- if (MaskC->isAllOnesValue()) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
-
- unsigned MaskIdx, MaskLen;
- if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
- // any single contingous sequence of 1s anywhere in the mask simply
- // describes a subset of the input bits shifted to the appropriate
- // position. Replace with the straight forward IR.
- Value *Input = II.getArgOperand(0);
- Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
- Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
- Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
- return IC.replaceInstUsesWith(II, Shifted);
- }
-
- if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Src = SrcC->getZExtValue();
- uint64_t Mask = MaskC->getZExtValue();
- uint64_t Result = 0;
- uint64_t BitToSet = 1;
-
- while (Mask) {
- // Isolate lowest set bit.
- uint64_t BitToTest = Mask & -Mask;
- if (BitToTest & Src)
- Result |= BitToSet;
-
- BitToSet <<= 1;
- // Clear lowest set bit.
- Mask &= Mask - 1;
- }
-
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- }
- break;
- case Intrinsic::x86_bmi_pdep_32:
- case Intrinsic::x86_bmi_pdep_64:
- if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- if (MaskC->isNullValue()) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- if (MaskC->isAllOnesValue()) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
-
- unsigned MaskIdx, MaskLen;
- if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
- // any single contingous sequence of 1s anywhere in the mask simply
- // describes a subset of the input bits shifted to the appropriate
- // position. Replace with the straight forward IR.
- Value *Input = II.getArgOperand(0);
- Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
- Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
- Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
- return IC.replaceInstUsesWith(II, Masked);
- }
-
- if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Src = SrcC->getZExtValue();
- uint64_t Mask = MaskC->getZExtValue();
- uint64_t Result = 0;
- uint64_t BitToTest = 1;
-
- while (Mask) {
- // Isolate lowest set bit.
- uint64_t BitToSet = Mask & -Mask;
- if (BitToTest & Src)
- Result |= BitToSet;
-
- BitToTest <<= 1;
- // Clear lowest set bit;
- Mask &= Mask - 1;
- }
-
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- }
- break;
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 9316360c5e02a..923b968382866 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -424,10 +424,6 @@ def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>;
// Zero High Bits Starting with Specified Bit Position.
def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>;
-// Parallel extract and deposit.
-def X86pdep : SDNode<"X86ISD::PDEP", SDTIntBinOp>;
-def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>;
-
// X86-specific multiply by immediate.
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 613a431fe365a..c6acaa697fdc7 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -1391,55 +1391,17 @@ multiclass PdepPext<string m, X86TypeInfo t, SDPatternOperator node,
}
let Predicates = [HasBMI2, NoEGPR] in {
- defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep>, XD, VEX;
- defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep>, XD, REX_W, VEX;
- defm PEXT32 : PdepPext<"pext", Xi32, X86pext>, XS, VEX;
- defm PEXT64 : PdepPext<"pext", Xi64, X86pext>, XS, REX_W, VEX;
+ defm PDEP32 : PdepPext<"pdep", Xi32, pdep>, XD, VEX;
+ defm PDEP64 : PdepPext<"pdep", Xi64, pdep>, XD, REX_W, VEX;
+ defm PEXT32 : PdepPext<"pext", Xi32, pext>, XS, VEX;
+ defm PEXT64 : PdepPext<"pext", Xi64, pext>, XS, REX_W, VEX;
}
let Predicates = [HasBMI2, HasEGPR] in {
- defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep, "_EVEX">, XD, EVEX;
- defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep, "_EVEX">, XD, REX_W, EVEX;
- defm PEXT32 : PdepPext<"pext", Xi32, X86pext, "_EVEX">, XS, EVEX;
- defm PEXT64 : PdepPext<"pext", Xi64, X86pext, "_EVEX">, XS, REX_W, EVEX;
-}
-
-let Predicates = [HasBMI2, NoEGPR] in {
- def : Pat<(i32 (pext GR32:$src, GR32:$mask)),
- (PEXT32rr GR32:$src, GR32:$mask)>;
- def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))),
- (PEXT32rm GR32:$src, i32mem:$mask)>;
- def : Pat<(i64 (pext GR64:$src, GR64:$mask)),
- (PEXT64rr GR64:$src, GR64:$mask)>;
- def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))),
- (PEXT64rm GR64:$src, i64mem:$mask)>;
- def : Pat<(i32 (pdep GR32:$src, GR32:$mask)),
- (PDEP32rr GR32:$src, GR32:$mask)>;
- def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))),
- (PDEP32rm GR32:$src, i32mem:$mask)>;
- def : Pat<(i64 (pdep GR64:$src, GR64:$mask)),
- (PDEP64rr GR64:$src, GR64:$mask)>;
- def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))),
- (PDEP64rm GR64:$src, i64mem:$mask)>;
-}
-
-let Predicates = [HasBMI2, HasEGPR] in {
- def : Pat<(i32 (pext GR32:$src, GR32:$mask)),
- (PEXT32rr_EVEX GR32:$src, GR32:$mask)>;
- def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))),
- (PEXT32rm_EVEX GR32:$src, i32mem:$mask)>;
- def : Pat<(i64 (pext GR64:$src, GR64:$mask)),
- (PEXT64rr_EVEX GR64:$src, GR64:$mask)>;
- def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))),
- (PEXT64rm_EVEX GR64:$src, i64mem:$mask)>;
- def : Pat<(i32 (pdep GR32:$src, GR32:$mask)),
- (PDEP32rr_EVEX GR32:$src, GR32:$mask)>;
- def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))),
- (PDEP32rm_EVEX GR32:$src, i32mem:$mask)>;
- def : Pat<(i64 (pdep GR64:$src, GR64:$mask)),
- (PDEP64rr_EVEX GR64:$src, GR64:$mask)>;
- def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))),
- (PDEP64rm_EVEX GR64:$src, i64mem:$mask)>;
+ defm PDEP32 : PdepPext<"pdep", Xi32, pdep, "_EVEX">, XD, EVEX;
+ defm PDEP64 : PdepPext<"pdep", Xi64, pdep, "_EVEX">, XD, REX_W, EVEX;
+ defm PEXT32 : PdepPext<"pext", Xi32, pext, "_EVEX">, XS, EVEX;
+ defm PEXT64 : PdepPext<"pext", Xi64, pext, "_EVEX">, XS, REX_W, EVEX;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 9e32ca23dafe2..a6b0db0230cf3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1837,10 +1837,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0),
- X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0),
- X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0),
- X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0),
- X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB,
0),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3cd7515eb7670..1df156053e302 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2660,6 +2660,40 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
return &CI;
break;
}
+ case Intrinsic::pdep: {
+ const APInt *MaskC;
+ if (match(II->getArgOperand(1), m_APInt(MaskC))) {
+ unsigned MaskIdx, MaskLen;
+ if (MaskC->isShiftedMask(MaskIdx, MaskLen)) {
+ // any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straight forward IR.
+ Value *Input = II->getArgOperand(0);
+ Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx);
+ Value *Shifted = Builder.CreateShl(Input, ShiftAmt);
+ Value *Masked = Builder.CreateAnd(Shifted, II->getArgOperand(1));
+ return replaceInstUsesWith(*II, Masked);
+ }
+ }
+ break;
+ }
+ case Intrinsic::pext: {
+ const APInt *MaskC;
+ if (match(II->getArgOperand(1), m_APInt(MaskC))) {
+ unsigned MaskIdx, MaskLen;
+ if (MaskC->isShiftedMask(MaskIdx, MaskLen)) {
+ // any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straight forward IR.
+ Value *Input = II->getArgOperand(0);
+ Value *Masked = Builder.CreateAnd(Input, II->getArgOperand(1));
+ Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx);
+ Value *Shifted = Builder.CreateLShr(Masked, ShiftAmt);
+ return replaceInstUsesWith(*II, Shifted);
+ }
+ }
+ break;
+ }
case Intrinsic::ptrmask: {
unsigned BitWidth = DL.getPointerTypeSizeInBits(II->getType());
KnownBits Known(BitWidth);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index bbc9f5d1b7506..f37e21f2c6dbb 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3333,6 +3333,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // Instrument packed bits deposit/expand intrinsics.
+ // All of these intrinsics are Z = I(X, Y)
+ // where the types of all operands and the result match.
+ // The following instrumentation happens to work for all of them:
+ // Sz = I(Sx, Y) | (sext (Sy != 0))
+ void handlePackedBits(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ShadowTy = getShadowTy(&I);
+
+ // If any bit of the mask operand is poisoned, then the whole thing is.
+ Value *SMask = getShadow(&I, 1);
+ SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)),
+ ShadowTy);
+ // Apply the same intrinsic to the shadow of the first operand.
+ Value *S = IRB.CreateIntrinsic(I.getIntrinsicID(), ShadowTy,
+ {getShadow(&I, 0), I.getOperand(1)});
+ setShadow(&I, IRB.CreateOr(SMask, S));
+ setOriginForNaryOp(I);
+ }
+
/// Instrument llvm.memmove
///
/// At this point we don't know if llvm.memmove will be inlined or not.
@@ -5873,6 +5893,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleFunnelShift(I);
break;
+ case Intrinsic::pdep:
+ case Intrinsic::pext:
+ handlePackedBits(I);
+ break;
+
case Intrinsic::is_constant:
// The result of llvm.is.constant() is always defined.
setShadow(&I, getCleanShadow(&I));
@@ -6503,10 +6528,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_bmi_bextr_64:
case Intrinsic::x86_bmi_bzhi_32:
case Intrinsic::x86_bmi_bzhi_64:
- case Intrinsic::x86_bmi_pdep_32:
- case Intrinsic::x86_bmi_pdep_64:
- case Intrinsic::x86_bmi_pext_32:
- case Intrinsic::x86_bmi_pext_64:
handleBmiIntrinsic(I);
break;
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index cabeebb0c3f36..41585bde9a696 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -128,7 +128,7 @@ define i32 @pdep32_load(i32 %x, ptr %y) {
define i32 @pdep32_anyext(i16 %x) {
; X86-LABEL: pdep32_anyext:
; X86: # %bb.0:
-; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA
; X86-NEXT: pdepl %ecx, %eax, %eax
; X86-NEXT: retl
@@ -178,7 +178,7 @@ define i32 @pdep32_demandedbits(i32 %x) {
define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
; X86-LABEL: pdep32_demandedbits2:
; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax
; X86-NEXT: andl $128, %eax
; X86-NEXT: retl
@@ -203,9 +203,8 @@ define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
; X86-LABEL: pdep32_demandedbits_mask:
; X86: # %bb.0:
-; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: pdepl %eax, %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax
; X86-NEXT: andl $32768, %eax # imm = 0x8000
; X86-NEXT: retl
;
@@ -230,9 +229,8 @@ define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) {
; X86-LABEL: pdep32_demandedbits_mask2:
; X86: # %bb.0:
-; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: pdepl %eax, %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: retl
;
@@ -285,22 +283,23 @@ define i32 @pdep32_knownbits(i32 %x) {
define i32 @pdep32_knownbits2(i32 %x, i32 %y) {
; X86-LABEL: pdep32_knownbits2:
; X86: # %bb.0:
-; X86-NEXT: movl $-256, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax
; X86-NEXT: imull %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: pdep32_knownbits2:
; X64: # %bb.0:
-; X64-NEXT: andl $-256, %edi
+; X64-NEXT: andl $16776960, %edi # imm = 0xFFFF00
; X64-NEXT: pdepl %esi, %edi, %eax
; X64-NEXT: imull %eax, %eax
; X64-NEXT: retq
;
; EGPR-LABEL: pdep32_knownbits2:
; EGPR: # %bb.0:
-; EGPR-NEXT: andl $-256, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0xff]
+; EGPR-NEXT: andl $16776960, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0x00]
+; EGPR-NEXT: # imm = 0xFFFF00
; EGPR-NEXT: pdepl %esi, %edi, %eax # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x43,0xf5,0xc6]
; EGPR-NEXT: imull %eax, %eax # encoding: [0x0f,0xaf,0xc0]
; EGPR-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll
index 46bec2956c73c..208546ec56246 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll
@@ -110,9 +110,9 @@ define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0
; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pdep.32(i32 [[TMP1]], i32 [[B]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.pdep.i32(i32 [[TMP1]], i32 [[B]])
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pdep.32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.pdep.i32(i32 [[A]], i32 [[B]])
; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[C]]
;
@@ -131,9 +131,9 @@ define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0
; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pdep.64(i64 [[TMP1]], i64 [[B]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.pdep.i64(i64 [[TMP1]], i64 [[B]])
; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pdep.64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT: [[C:%.*]] = call i64 @llvm.pdep.i64(i64 [[A]], i64 [[B]])
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[C]]
;
@@ -152,9 +152,9 @@ define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0
; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pext.32(i32 [[TMP1]], i32 [[B]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.pext.i32(i32 [[TMP1]], i32 [[B]])
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pext.32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.pext.i32(i32 [[A]], i32 [[B]])
; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[C]]
;
@@ -173,9 +173,9 @@ define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0
; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pext.64(i64 [[TMP1]], i64 [[B]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.pext.i64(i64 [[TMP1]], i64 [[B]])
; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pext.64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT: [[C:%.*]] = call i64 @llvm.pext.i64(i64 [[A]], i64 [[B]])
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[C]]
;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/pdep.ll b/llvm/test/Instrumentation/MemorySanitizer/pdep.ll
index 5a94f6abfa773..f323f386d0f50 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/pdep.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/pdep.ll
@@ -7,10 +7,13 @@ target triple = "x86_64-unknown-linux-gnu"
define i8 @Test_pdep_8(i8 %a, i8 %b) sanitize_memory {
; CHECK-LABEL: define i8 @Test_pdep_8(
; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i8
+; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.pdep.i8(i8 [[TMP2]], i8 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i8 @llvm.pdep.i8(i8 [[A]], i8 [[B]])
; CHECK-NEXT: store i8 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[C]]
@@ -23,10 +26,13 @@ define i8 @Test_pdep_8(i8 %a, i8 %b) sanitize_memory {
define i16 @Test_pdep_16(i16 %a, i16 %b) sanitize_memory {
; CHECK-LABEL: define i16 @Test_pdep_16(
; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i16 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i16
+; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.pdep.i16(i16 [[TMP2]], i16 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i16 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i16 @llvm.pdep.i16(i16 [[A]], i16 [[B]])
; CHECK-NEXT: store i16 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[C]]
@@ -39,10 +45,13 @@ define i16 @Test_pdep_16(i16 %a, i16 %b) sanitize_memory {
define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory {
; CHECK-LABEL: define i32 @Test_pdep_32(
; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.pdep.i32(i32 [[TMP2]], i32 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[A]], i32 [[B]])
; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[C]]
@@ -55,10 +64,13 @@ define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory {
define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory {
; CHECK-LABEL: define i64 @Test_pdep_64(
; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.pdep.i64(i64 [[TMP2]], i64 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[A]], i64 [[B]])
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[C]]
@@ -70,10 +82,13 @@ define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory {
define i128 @Test_pdep_128(i128 %a, i128 %b) sanitize_memory {
; CHECK-LABEL: define i128 @Test_pdep_128(
; CHECK-SAME: i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = call i128 @llvm.pdep.i128(i128 [[TMP2]], i128 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i128 @llvm.pdep.i128(i128 [[A]], i128 [[B]])
; CHECK-NEXT: store i128 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i128 [[C]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/pext.ll b/llvm/test/Instrumentation/MemorySanitizer/pext.ll
index 72c4834998446..2caf6a47ac93b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/pext.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/pext.ll
@@ -7,10 +7,13 @@ target triple = "x86_64-unknown-linux-gnu"
define i8 @Test_pext_8(i8 %a, i8 %b) sanitize_memory {
; CHECK-LABEL: define i8 @Test_pext_8(
; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i8
+; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.pext.i8(i8 [[TMP2]], i8 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i8 @llvm.pext.i8(i8 [[A]], i8 [[B]])
; CHECK-NEXT: store i8 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[C]]
@@ -23,10 +26,13 @@ define i8 @Test_pext_8(i8 %a, i8 %b) sanitize_memory {
define i16 @Test_pext_16(i16 %a, i16 %b) sanitize_memory {
; CHECK-LABEL: define i16 @Test_pext_16(
; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i16 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i16
+; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.pext.i16(i16 [[TMP2]], i16 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i16 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i16 @llvm.pext.i16(i16 [[A]], i16 [[B]])
; CHECK-NEXT: store i16 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[C]]
@@ -39,10 +45,13 @@ define i16 @Test_pext_16(i16 %a, i16 %b) sanitize_memory {
define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory {
; CHECK-LABEL: define i32 @Test_pext_32(
; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.pext.i32(i32 [[TMP2]], i32 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.pext.i32(i32 [[A]], i32 [[B]])
; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[C]]
@@ -55,10 +64,13 @@ define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory {
define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory {
; CHECK-LABEL: define i64 @Test_pext_64(
; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.pext.i64(i64 [[TMP2]], i64 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.pext.i64(i64 [[A]], i64 [[B]])
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[C]]
@@ -70,10 +82,13 @@ define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory {
define i128 @Test_pext_128(i128 %a, i128 %b) sanitize_memory {
; CHECK-LABEL: define i128 @Test_pext_128(
; CHECK-SAME: i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i128, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[TMP7]] to i128
+; CHECK-NEXT: [[TMP5:%.*]] = call i128 @llvm.pext.i128(i128 [[TMP2]], i128 [[B]])
+; CHECK-NEXT: [[TMP6:%.*]] = or i128 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[C:%.*]] = tail call i128 @llvm.pext.i128(i128 [[A]], i128 [[B]])
; CHECK-NEXT: store i128 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i128 [[C]]
diff --git a/llvm/test/Transforms/InstCombine/pdep.ll b/llvm/test/Transforms/InstCombine/pdep.ll
index ceb4d1f97b6b0..b726e87a6168c 100644
--- a/llvm/test/Transforms/InstCombine/pdep.ll
+++ b/llvm/test/Transforms/InstCombine/pdep.ll
@@ -3,8 +3,7 @@
define i32 @test_pdep_32_zero_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_pdep_32_zero_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], i32 0)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 0
;
%1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 0)
ret i32 %1
@@ -12,8 +11,7 @@ define i32 @test_pdep_32_zero_mask(i32 %x) nounwind readnone {
define i64 @test_pdep_64_zero_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_pdep_64_zero_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], i64 0)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 0
;
%1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 0)
ret i64 %1
@@ -21,8 +19,7 @@ define i64 @test_pdep_64_zero_mask(i64 %x) nounwind readnone {
define i32 @test_pdep_32_allones_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_pdep_32_allones_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], i32 -1)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP1:%.*]]
;
%1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 -1)
ret i32 %1
@@ -30,8 +27,7 @@ define i32 @test_pdep_32_allones_mask(i32 %x) nounwind readnone {
define i64 @test_pdep_64_allones_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_pdep_64_allones_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], i64 -1)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP1:%.*]]
;
%1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 -1)
ret i64 %1
@@ -39,7 +35,8 @@ define i64 @test_pdep_64_allones_mask(i64 %x) nounwind readnone {
define i32 @test_pdep_32_shifted_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_pdep_32_shifted_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], i32 12)
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[X:%.*]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP2]], 12
; CHECK-NEXT: ret i32 [[TMP1]]
;
%1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 12)
@@ -48,7 +45,8 @@ define i32 @test_pdep_32_shifted_mask(i32 %x) nounwind readnone {
define i64 @test_pdep_64_shifted_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_pdep_64_shifted_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], i64 12)
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[X:%.*]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP2]], 12
; CHECK-NEXT: ret i64 [[TMP1]]
;
%1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 12)
@@ -57,8 +55,7 @@ define i64 @test_pdep_64_shifted_mask(i64 %x) nounwind readnone {
define i32 @test_pdep_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_pdep_32_constant_fold(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 -252645136)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 807407616
;
%1 = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 4042322160)
ret i32 %1
@@ -66,8 +63,7 @@ define i32 @test_pdep_32_constant_fold() nounwind readnone {
define i64 @test_pdep_64_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_pdep_64_constant_fold(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -1085102592571150096)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 -1089641583808049024
;
%1 = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -1085102592571150096)
ret i64 %1
@@ -75,8 +71,7 @@ define i64 @test_pdep_64_constant_fold() nounwind readnone {
define i32 @test_pdep_32_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_pdep_32_constant_fold_2(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 -16776961)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 838860816
;
%1 = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 4278190335)
ret i32 %1
@@ -84,8 +79,7 @@ define i32 @test_pdep_32_constant_fold_2() nounwind readnone {
define i64 @test_pdep_64_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_pdep_64_constant_fold_2(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -72056498804490496)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 -144114243170822144
;
%1 = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 -72056498804490496)
ret i64 %1
diff --git a/llvm/test/Transforms/InstCombine/pext.ll b/llvm/test/Transforms/InstCombine/pext.ll
index 52baa9a171c62..0f13f3f542023 100644
--- a/llvm/test/Transforms/InstCombine/pext.ll
+++ b/llvm/test/Transforms/InstCombine/pext.ll
@@ -3,8 +3,7 @@
define i32 @test_pext_32_zero_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_pext_32_zero_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], i32 0)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 0
;
%1 = tail call i32 @llvm.pext.i32(i32 %x, i32 0)
ret i32 %1
@@ -12,8 +11,7 @@ define i32 @test_pext_32_zero_mask(i32 %x) nounwind readnone {
define i64 @test_pext_64_zero_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_pext_64_zero_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], i64 0)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 0
;
%1 = tail call i64 @llvm.pext.i64(i64 %x, i64 0)
ret i64 %1
@@ -21,8 +19,7 @@ define i64 @test_pext_64_zero_mask(i64 %x) nounwind readnone {
define i32 @test_pext_32_allones_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_pext_32_allones_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], i32 -1)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 [[TMP1:%.*]]
;
%1 = tail call i32 @llvm.pext.i32(i32 %x, i32 -1)
ret i32 %1
@@ -30,8 +27,7 @@ define i32 @test_pext_32_allones_mask(i32 %x) nounwind readnone {
define i64 @test_pext_64_allones_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_pext_64_allones_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], i64 -1)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP1:%.*]]
;
%1 = tail call i64 @llvm.pext.i64(i64 %x, i64 -1)
ret i64 %1
@@ -39,7 +35,8 @@ define i64 @test_pext_64_allones_mask(i64 %x) nounwind readnone {
define i32 @test_pext_32_shifted_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_pext_32_shifted_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], i32 6)
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP2]], 3
; CHECK-NEXT: ret i32 [[TMP1]]
;
%1 = tail call i32 @llvm.pext.i32(i32 %x, i32 6)
@@ -48,7 +45,8 @@ define i32 @test_pext_32_shifted_mask(i32 %x) nounwind readnone {
define i64 @test_pext_64_shifted_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_pext_64_shifted_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], i64 6)
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[X:%.*]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT: ret i64 [[TMP1]]
;
%1 = tail call i64 @llvm.pext.i64(i64 %x, i64 6)
@@ -58,8 +56,7 @@ define i64 @test_pext_64_shifted_mask(i64 %x) nounwind readnone {
define i32 @test_pext_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_pext_32_constant_fold(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 1985229328, i32 -252645136)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 30001
;
%1 = tail call i32 @llvm.pext.i32(i32 1985229328, i32 4042322160)
ret i32 %1
@@ -67,8 +64,7 @@ define i32 @test_pext_32_constant_fold() nounwind readnone {
define i64 @test_pext_64_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_pext_64_constant_fold(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -1085102592571150096)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 1966210489
;
%1 = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -1085102592571150096)
ret i64 %1
@@ -76,8 +72,7 @@ define i64 @test_pext_64_constant_fold() nounwind readnone {
define i32 @test_pext_32_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_pext_32_constant_fold_2(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 1985229328, i32 -16776961)
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: ret i32 30224
;
%1 = tail call i32 @llvm.pext.i32(i32 1985229328, i32 4278190335)
ret i32 %1
@@ -85,8 +80,7 @@ define i32 @test_pext_32_constant_fold_2() nounwind readnone {
define i64 @test_pext_64_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_pext_64_constant_fold_2(
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -72056498804490496)
-; CHECK-NEXT: ret i64 [[TMP1]]
+; CHECK-NEXT: ret i64 1980816570
;
%1 = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 -72056498804490496)
ret i64 %1
>From f0134cc7a5a56b53dfbe2887cd759806845c8797 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 19 Jun 2026 08:29:03 +0200
Subject: [PATCH 06/10] AMDGPU: Add subtarget feature for controllable xnack
modes (#204523)
This replaces the previously removed xnack-any-only feature,
with the inversion xnack-on-off-modes. All pre-gfx12.5 xnack
targets support the controllable mode. Ignore explicitly
set xnack settings the same way as is done for xnack requests
on other unsupported targets.
---
clang/lib/Basic/TargetID.cpp | 3 +-
clang/lib/Driver/ToolChains/AMDGPU.cpp | 7 +--
clang/test/Driver/invalid-target-id.cl | 21 +++++++++
.../llvm/TargetParser/AMDGPUTargetParser.def | 44 +++++++++----------
.../llvm/TargetParser/AMDGPUTargetParser.h | 4 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 20 ++++++---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 +
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 +-
.../MCTargetDesc/AMDGPUTargetStreamer.h | 10 +----
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 24 +++++-----
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 +-
.../AMDGPU/target-id-xnack-always-on.ll | 22 ++++++++++
12 files changed, 105 insertions(+), 58 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll
diff --git a/clang/lib/Basic/TargetID.cpp b/clang/lib/Basic/TargetID.cpp
index 6d9ba55ccd9d7..67f429607ef27 100644
--- a/clang/lib/Basic/TargetID.cpp
+++ b/clang/lib/Basic/TargetID.cpp
@@ -32,7 +32,8 @@ getAllPossibleAMDGPUTargetIDFeatures(const llvm::Triple &T,
: llvm::AMDGPU::getArchAttrR600(ProcKind);
if (Features & llvm::AMDGPU::FEATURE_SRAMECC)
Ret.push_back("sramecc");
- if (Features & llvm::AMDGPU::FEATURE_XNACK)
+ // Only allow xnack in target ID if the processor supports on/off modes.
+ if (Features & llvm::AMDGPU::FEATURE_XNACK_ON_OFF_MODES)
Ret.push_back("xnack");
return Ret;
}
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index ddc26604a8006..b57579f135b36 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -1130,9 +1130,10 @@ static bool isXnackAvailable(const llvm::Triple &TT, llvm::StringRef TargetID) {
auto Features = TT.isAMDGCN() ? llvm::AMDGPU::getArchAttrAMDGCN(ProcKind)
: llvm::AMDGPU::getArchAttrR600(ProcKind);
- // If processor has xnack always on, Address sanitizer is supported
- bool XnackAvailable = (Features & llvm::AMDGPU::FEATURE_XNACK_ALWAYS);
- if (XnackAvailable)
+ // If processor has xnack but doesn't support on/off modes, xnack is always on
+ bool XnackAlwaysOn = (Features & llvm::AMDGPU::FEATURE_XNACK) &&
+ !(Features & llvm::AMDGPU::FEATURE_XNACK_ON_OFF_MODES);
+ if (XnackAlwaysOn)
return true;
// Otherwise, check if xnack+ is explicitly enabled in the target ID
diff --git a/clang/test/Driver/invalid-target-id.cl b/clang/test/Driver/invalid-target-id.cl
index 4f6f140437885..f93e618e460be 100644
--- a/clang/test/Driver/invalid-target-id.cl
+++ b/clang/test/Driver/invalid-target-id.cl
@@ -39,3 +39,24 @@
// RUN: %s 2>&1 | FileCheck -check-prefix=NOCOLON %s
// NOCOLON: error: invalid target ID 'gfx900+xnack'
+
+// gfx1250 and gfx12-5-generic do not support xnack on/off modes
+// RUN: not %clang -target amdgcn-amd-amdhsa \
+// RUN: -mcpu=gfx1250:xnack+ -nostdlib \
+// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX1250 %s
+
+// RUN: not %clang -target amdgcn-amd-amdhsa \
+// RUN: -mcpu=gfx1250:xnack- -nostdlib \
+// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX1250 %s
+
+// XNACK-MODE-GFX1250: error: invalid target ID 'gfx1250:xnack{{[+-]}}'
+
+// RUN: not %clang -target amdgcn-amd-amdhsa \
+// RUN: -mcpu=gfx12-5-generic:xnack+ -nostdlib \
+// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX125 %s
+
+// RUN: not %clang -target amdgcn-amd-amdhsa \
+// RUN: -mcpu=gfx12-5-generic:xnack- -nostdlib \
+// RUN: %s 2>&1 | FileCheck -check-prefix=XNACK-MODE-GFX125 %s
+
+// XNACK-MODE-GFX125: error: invalid target ID 'gfx12-5-generic:xnack{{[+-]}}'
diff --git a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def
index d15fc01f30019..dcc0c28b1ee74 100644
--- a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def
+++ b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.def
@@ -76,7 +76,7 @@ AMDGCN_GPU_ALIAS("mullins", GK_GFX703)
AMDGCN_GPU ("gfx704", GK_GFX704, ( 7, 0, 4), FEATURE_NONE)
AMDGCN_GPU_ALIAS("bonaire", GK_GFX704)
AMDGCN_GPU ("gfx705", GK_GFX705, ( 7, 0, 5), FEATURE_NONE)
-AMDGCN_GPU ("gfx801", GK_GFX801, ( 8, 0, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
+AMDGCN_GPU ("gfx801", GK_GFX801, ( 8, 0, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
AMDGCN_GPU_ALIAS("carrizo", GK_GFX801)
AMDGCN_GPU ("gfx802", GK_GFX802, ( 8, 0, 2), FEATURE_FAST_DENORMAL_F32)
AMDGCN_GPU_ALIAS("iceland", GK_GFX802)
@@ -87,22 +87,22 @@ AMDGCN_GPU_ALIAS("polaris10", GK_GFX803)
AMDGCN_GPU_ALIAS("polaris11", GK_GFX803)
AMDGCN_GPU ("gfx805", GK_GFX805, ( 8, 0, 5), FEATURE_FAST_DENORMAL_F32)
AMDGCN_GPU_ALIAS("tongapro", GK_GFX805)
-AMDGCN_GPU ("gfx810", GK_GFX810, ( 8, 1, 0), FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
+AMDGCN_GPU ("gfx810", GK_GFX810, ( 8, 1, 0), FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
AMDGCN_GPU_ALIAS("stoney", GK_GFX810)
-AMDGCN_GPU ("gfx900", GK_GFX900, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
-AMDGCN_GPU ("gfx902", GK_GFX902, ( 9, 0, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
-AMDGCN_GPU ("gfx904", GK_GFX904, ( 9, 0, 4), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
-AMDGCN_GPU ("gfx906", GK_GFX906, ( 9, 0, 6), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC)
-AMDGCN_GPU ("gfx908", GK_GFX908, ( 9, 0, 8), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC)
-AMDGCN_GPU ("gfx909", GK_GFX909, ( 9, 0, 9), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
-AMDGCN_GPU ("gfx90a", GK_GFX90A, ( 9, 0, 10), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC)
-AMDGCN_GPU ("gfx90c", GK_GFX90C, ( 9, 0, 12), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
-AMDGCN_GPU ("gfx942", GK_GFX942, ( 9, 4, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC)
-AMDGCN_GPU ("gfx950", GK_GFX950, ( 9, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC)
-AMDGCN_GPU ("gfx1010", GK_GFX1010, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP)
-AMDGCN_GPU ("gfx1011", GK_GFX1011, (10, 1, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP)
-AMDGCN_GPU ("gfx1012", GK_GFX1012, (10, 1, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP)
-AMDGCN_GPU ("gfx1013", GK_GFX1013, (10, 1, 3), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP)
+AMDGCN_GPU ("gfx900", GK_GFX900, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
+AMDGCN_GPU ("gfx902", GK_GFX902, ( 9, 0, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
+AMDGCN_GPU ("gfx904", GK_GFX904, ( 9, 0, 4), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
+AMDGCN_GPU ("gfx906", GK_GFX906, ( 9, 0, 6), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx908", GK_GFX908, ( 9, 0, 8), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx909", GK_GFX909, ( 9, 0, 9), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
+AMDGCN_GPU ("gfx90a", GK_GFX90A, ( 9, 0, 10), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx90c", GK_GFX90C, ( 9, 0, 12), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
+AMDGCN_GPU ("gfx942", GK_GFX942, ( 9, 4, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx950", GK_GFX950, ( 9, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx1010", GK_GFX1010, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP)
+AMDGCN_GPU ("gfx1011", GK_GFX1011, (10, 1, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP)
+AMDGCN_GPU ("gfx1012", GK_GFX1012, (10, 1, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP)
+AMDGCN_GPU ("gfx1013", GK_GFX1013, (10, 1, 3), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP)
AMDGCN_GPU ("gfx1030", GK_GFX1030, (10, 3, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
AMDGCN_GPU ("gfx1031", GK_GFX1031, (10, 3, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
AMDGCN_GPU ("gfx1032", GK_GFX1032, (10, 3, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
@@ -123,8 +123,8 @@ AMDGCN_GPU ("gfx1171", GK_GFX1171, (11, 7, 1), FEATURE_FAST_FMA_F32|FEAT
AMDGCN_GPU ("gfx1172", GK_GFX1172, (11, 7, 2), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
AMDGCN_GPU ("gfx1200", GK_GFX1200, (12, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
AMDGCN_GPU ("gfx1201", GK_GFX1201, (12, 0, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
-AMDGCN_GPU ("gfx1250", GK_GFX1250, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS|FEATURE_SRAMECC)
-AMDGCN_GPU ("gfx1251", GK_GFX1251, (12, 5, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx1250", GK_GFX1250, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx1251", GK_GFX1251, (12, 5, 1), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_SRAMECC)
AMDGCN_GPU ("gfx1310", GK_GFX1310, (13, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
// Generic targets return the lowest common denominator
@@ -140,13 +140,13 @@ AMDGCN_GPU ("gfx1310", GK_GFX1310, (13, 1, 0), FEATURE_FAST_FMA_F32|FEAT
//
// TODO: Split up this API depending on its caller so
// generic target handling is more obvious and less risky.
-AMDGCN_GPU ("gfx9-generic", GK_GFX9_GENERIC, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK)
-AMDGCN_GPU ("gfx10-1-generic", GK_GFX10_1_GENERIC, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP)
+AMDGCN_GPU ("gfx9-generic", GK_GFX9_GENERIC, ( 9, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES)
+AMDGCN_GPU ("gfx10-1-generic", GK_GFX10_1_GENERIC, (10, 1, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_WGP)
AMDGCN_GPU ("gfx10-3-generic", GK_GFX10_3_GENERIC, (10, 3, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
AMDGCN_GPU ("gfx11-generic", GK_GFX11_GENERIC, (11, 0, 3), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
AMDGCN_GPU ("gfx12-generic", GK_GFX12_GENERIC, (12, 0, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP)
-AMDGCN_GPU ("gfx9-4-generic", GK_GFX9_4_GENERIC, ( 9, 4, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC)
-AMDGCN_GPU ("gfx12-5-generic", GK_GFX12_5_GENERIC, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS)
+AMDGCN_GPU ("gfx9-4-generic", GK_GFX9_4_GENERIC, ( 9, 4, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_XNACK_ON_OFF_MODES|FEATURE_SRAMECC)
+AMDGCN_GPU ("gfx12-5-generic", GK_GFX12_5_GENERIC, (12, 5, 0), FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK)
#undef AMDGCN_GPU
#undef AMDGCN_GPU_ALIAS
diff --git a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h
index 7c192b36b6ec8..1288f4cd69ff0 100644
--- a/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h
+++ b/llvm/include/llvm/TargetParser/AMDGPUTargetParser.h
@@ -72,8 +72,8 @@ enum ArchFeatureKind : uint32_t {
// WGP mode is supported.
FEATURE_WGP = 1 << 9,
- // Xnack is available by default
- FEATURE_XNACK_ALWAYS = 1 << 10
+ // Xnack on/off modes are supported.
+ FEATURE_XNACK_ON_OFF_MODES = 1 << 10
};
enum FeatureError : uint32_t {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 588f63aeffcb9..2abb9c0154947 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -207,6 +207,13 @@ def FeatureSupportsXNACK : SubtargetFeature<"xnack-support",
"Hardware supports XNACK"
>;
+defm XNACKOnOffModes : AMDGPUSubtargetFeature<"xnack-on-off-modes",
+ "Target supports XNACK on/off modes",
+ /*GenPredicate=*/1,
+ /*GenAssemblerPredicate=*/0,
+ [FeatureSupportsXNACK]
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -217,7 +224,8 @@ def FeatureSupportsXNACK : SubtargetFeature<"xnack-support",
def FeatureXNACK : SubtargetFeature<"xnack",
"EnableXNACK",
"true",
- "Enable XNACK support"
+ "Enable XNACK support",
+ [FeatureSupportsXNACK]
>;
def FeatureTgSplit : SubtargetFeature<"tgsplit",
@@ -1503,7 +1511,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
- FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
+ FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureXNACKOnOffModes,
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad,
@@ -1683,7 +1691,7 @@ def FeatureISAVersion8_0_1 : FeatureSet<
!listconcat(FeatureISAVersion8_0_Common.Features,
[FeatureFastFMAF32,
FeatureHalfRate64Ops,
- FeatureSupportsXNACK])>;
+ FeatureXNACKOnOffModes])>;
def FeatureISAVersion8_0_2 : FeatureSet<
!listconcat(FeatureISAVersion8_0_Common.Features,
@@ -1700,7 +1708,7 @@ def FeatureISAVersion8_0_5 : FeatureSet<
def FeatureISAVersion8_1_0 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount16,
- FeatureSupportsXNACK,
+ FeatureXNACKOnOffModes,
FeatureImageStoreD16Bug,
FeatureImageGather4D16Bug]>;
@@ -1895,7 +1903,7 @@ def FeatureISAVersion10_1_Common : FeatureSet<
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLDSMisalignedBug,
- FeatureSupportsXNACK,
+ FeatureXNACKOnOffModes,
// gfx101x bugs
FeatureVcmpxPermlaneHazard,
FeatureVMEMtoScalarWriteHazard,
@@ -2201,7 +2209,6 @@ def FeatureISAVersion12_50_Common : FeatureSet<
FeatureSetPrioIncWgInst,
FeatureSWakeupBarrier,
Feature45BitNumRecordsBufferResource,
- FeatureSupportsXNACK,
FeatureXNACK,
FeatureClusters,
FeatureD16Writes32BitVgpr,
@@ -2268,6 +2275,7 @@ def FeatureISAVersion12_5_Generic: FeatureSet<
[FeatureAddressableLocalMemorySize327680,
FeatureSetregVGPRMSBFixup,
FeatureRequiresCOV6,
+ FeatureSupportsXNACK,
FeatureGFX125xLowestRateWMMA,
FeatureTransCoexecutionHazard,
FeatureWMMACoexecutionHazards])>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ce7e22436f33f..03a046bcb9142 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -293,6 +293,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
// Property of the kernel/environment which can't actually differ.
AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
+ AMDGPU::FeatureXNACKOnOffModes, AMDGPU::FeatureSupportsXNACK,
AMDGPU::FeatureTrapHandler,
// The default assumption needs to be ecc is enabled, but no directly
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 14de6753d42e4..55edfc2ea52d2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -157,8 +157,6 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
assert(llvm::isPowerOf2_32(InstCacheLineSize) &&
"InstCacheLineSize must be a power of 2");
- TargetID.setTargetIDFromFeaturesString(FS);
-
LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
<< TargetID.getXnackSetting() << '\n');
LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
@@ -182,7 +180,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
: // clang-format off
AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
- TargetID(*this),
+ TargetID(*this, FS),
InstrItins(getInstrItineraryForCPU(GPU)),
BufferOOBRelaxed(BufferOOBRelaxed),
TBufferOOBRelaxed(TBufferOOBRelaxed),
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index ca1fe3ccf3da1..dc9636c6c2105 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -139,15 +139,9 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
std::optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() {
return TargetID;
}
- void initializeTargetID(const MCSubtargetInfo &STI) {
- assert(TargetID == std::nullopt && "TargetID can only be initialized once");
- TargetID.emplace(STI);
- }
void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) {
- initializeTargetID(STI);
-
- assert(getTargetID() != std::nullopt && "TargetID is None");
- getTargetID()->setTargetIDFromFeaturesString(FeatureString);
+ assert(TargetID == std::nullopt && "TargetID can only be initialized once");
+ TargetID.emplace(STI, FeatureString);
}
};
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index cfa9a59d3ded2..e1e83ece32ad0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1099,20 +1099,19 @@ VOPD::InstInfo getVOPDInstInfo(unsigned VOPDOpcode,
namespace IsaInfo {
-AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
- : STI(STI), XnackSetting(TargetIDSetting::Any),
- SramEccSetting(TargetIDSetting::Any) {
- if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
- XnackSetting = TargetIDSetting::Unsupported;
- if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
- SramEccSetting = TargetIDSetting::Unsupported;
-}
+AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI,
+ StringRef FeatureString)
+ : STI(STI), XnackSetting(STI.getFeatureBits().test(FeatureSupportsXNACK)
+ ? TargetIDSetting::Any
+ : TargetIDSetting::Unsupported),
+ SramEccSetting(STI.getFeatureBits().test(FeatureSupportsSRAMECC)
+ ? TargetIDSetting::Any
+ : TargetIDSetting::Unsupported) {
-void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) {
// Check if xnack or sramecc is explicitly enabled or disabled. In the
// absence of the target features we assume we must generate code that can run
// in any environment.
- SubtargetFeatures Features(FS);
+ SubtargetFeatures Features(FeatureString);
std::optional<bool> XnackRequested;
std::optional<bool> SramEccRequested;
@@ -1127,7 +1126,10 @@ void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) {
SramEccRequested = false;
}
- bool XnackSupported = isXnackSupported();
+ // Only allow changing xnack setting if the target supports on/off modes.
+ // Targets without on/off mode support keep their initial setting (Any).
+
+ bool XnackSupported = STI.getFeatureBits().test(FeatureXNACKOnOffModes);
bool SramEccSupported = isSramEccSupported();
if (XnackRequested) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 11c393a623d20..6c771b3460662 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -160,7 +160,7 @@ class AMDGPUTargetID {
TargetIDSetting SramEccSetting;
public:
- explicit AMDGPUTargetID(const MCSubtargetInfo &STI);
+ explicit AMDGPUTargetID(const MCSubtargetInfo &STI, StringRef FeatureString);
~AMDGPUTargetID() = default;
/// \return True if the current xnack setting is not "Unsupported".
@@ -217,7 +217,6 @@ class AMDGPUTargetID {
SramEccSetting = NewSramEccSetting;
}
- void setTargetIDFromFeaturesString(StringRef FS);
void setTargetIDFromTargetIDStream(StringRef TargetID);
/// Write string representation to \p OS
diff --git a/llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll b/llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll
new file mode 100644
index 0000000000000..13d13c875b8aa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/target-id-xnack-always-on.ll
@@ -0,0 +1,22 @@
+; gfx1250, gfx1251, and gfx12-5-generic have xnack always on because they don't
+; support on/off modes (no FeatureXNACKOnOffModes). The target ID should not
+; include xnack modifiers regardless of -mattr settings.
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1251 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-5-generic < %s | FileCheck --check-prefix=CHECK %s
+
+; Even with -mattr=+xnack or -mattr=-xnack, the target ID doesn't change
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+xnack < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-xnack < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1251 -mattr=+xnack < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1251 -mattr=-xnack < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-5-generic -mattr=+xnack < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-5-generic -mattr=-xnack < %s | FileCheck %s
+
+; CHECK: .amdgcn_target "amdgcn-amd-amdhsa--gfx{{1250|1251|12-5-generic}}"
+
+define void @func0() {
+entry:
+ ret void
+}
>From b65d5100d712e49abe85b337b03663ec1dcbd266 Mon Sep 17 00:00:00 2001
From: Kevin Sala Penades <salapenades1 at llnl.gov>
Date: Thu, 18 Jun 2026 23:43:38 -0700
Subject: [PATCH 07/10] [offload][OpenMP] Fix record replay when no memory is
used (#201771)
Progams that do not use any memory (e.g., no mappings) were failing
because we were trying to execute zero size transfers. This commit adds
handling for this case.
---
offload/libomptarget/omptarget.cpp | 30 +++++++++------
.../common/src/RecordReplay.cpp | 37 +++++++++----------
.../record-replay-empty-memory.cpp | 26 +++++++++++++
.../kernelreplay/llvm-omp-kernel-replay.cpp | 3 +-
4 files changed, 64 insertions(+), 32 deletions(-)
create mode 100644 offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index d18b8e38b7808..84b7554253d20 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2440,6 +2440,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
// Initialize the device memory of each global.
for (int32_t I = 0; I < NumGlobals; ++I) {
assert(Globals[I].AuxAddr && "Global has no AuxAddr.");
+ assert(Globals[I].Size && "Global has Size zero.");
// Initialize the value of the global in the device.
int Ret = Device.submitData(Symbols[I + 1].DevPtr, Globals[I].AuxAddr,
@@ -2450,25 +2451,30 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
}
}
- // Reuse a previous device allocation or allocate a new device buffer.
+ // Reuse a previous device allocation or allocate a new device buffer. Do not
+ // allocate anything if the size is zero.
void *&TgtPtr = ReuseDeviceAlloc;
- if (!TgtPtr)
+ if (!TgtPtr && DeviceMemorySize) {
TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
TARGET_ALLOC_DEFAULT);
- if (!TgtPtr) {
- REPORT() << "Failed to allocate device memory.";
- return OFFLOAD_FAIL;
+ if (!TgtPtr) {
+ REPORT() << "Failed to allocate device memory.";
+ return OFFLOAD_FAIL;
+ }
}
// Save the device allocation for future replays of the same kernel.
if (ReplayOutcome)
ReplayOutcome->ReplayDeviceAlloc = TgtPtr;
- int Ret =
- Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
- if (Ret != OFFLOAD_SUCCESS) {
- REPORT() << "Failed to submit data to a global.";
- return OFFLOAD_FAIL;
+ // Initialize the device memory.
+ if (DeviceMemorySize) {
+ int Ret =
+ Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
+ if (Ret != OFFLOAD_SUCCESS) {
+ REPORT() << "Failed to submit data to the device memory.";
+ return OFFLOAD_FAIL;
+ }
}
KernelArgsTy KernelArgs{};
@@ -2487,8 +2493,8 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
KernelExtraArgsTy KernelExtraArgs{};
KernelExtraArgs.ReplayOutcome = ReplayOutcome;
- Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets, KernelArgs,
- &KernelExtraArgs, AsyncInfo);
+ int Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets,
+ KernelArgs, &KernelExtraArgs, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT() << "Failed to launch kernel replay.";
return OFFLOAD_FAIL;
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index 7cfd39288307b..bd93f79bb8bad 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -338,23 +338,24 @@ Error NativeRecordReplayTy::recordSnapshot(StringRef Filename) {
uint64_t RecordSize = CurrentSize;
AllocationLock.unlock();
- ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
- WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize);
- if (!DeviceMemoryMB)
- return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
- "creating MemoryBuffer for device memory");
-
- if (auto Err = Device.dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
- StartAddr, RecordSize, nullptr))
- return Err;
-
- StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), RecordSize);
+ std::unique_ptr<WritableMemoryBuffer> DeviceMB;
+ if (RecordSize) {
+ DeviceMB = WritableMemoryBuffer::getNewUninitMemBuffer(RecordSize);
+ if (!DeviceMB)
+ return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
+ "creating MemoryBuffer for device memory");
+
+ if (auto Err = Device.dataRetrieve(DeviceMB->getBufferStart(), StartAddr,
+ RecordSize, nullptr))
+ return Err;
+ }
std::error_code EC;
raw_fd_ostream OS(Filename, EC);
if (EC)
return Plugin::error(ErrorCode::HOST_IO, "saving memory snapshot file");
- OS << DeviceMemory;
+ if (DeviceMB)
+ OS.write(DeviceMB->getBufferStart(), RecordSize);
OS.close();
return Plugin::success();
}
@@ -389,13 +390,12 @@ Error NativeRecordReplayTy::recordGlobals(StringRef Filename) {
NumGlobals++;
}
- ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
- WritableMemoryBuffer::getNewUninitMemBuffer(TotalSize);
+ auto GlobalsMB = WritableMemoryBuffer::getNewUninitMemBuffer(TotalSize);
if (!GlobalsMB)
return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
"creating MemoryBuffer for globals memory");
- void *BufferPtr = GlobalsMB.get()->getBufferStart();
+ void *BufferPtr = GlobalsMB->getBufferStart();
*((uint32_t *)(BufferPtr)) = NumGlobals;
BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
@@ -418,16 +418,15 @@ Error NativeRecordReplayTy::recordGlobals(StringRef Filename) {
return Err;
BufferPtr = utils::advancePtr(BufferPtr, Global.Size);
}
- assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
+ assert(BufferPtr == GlobalsMB->getBufferEnd() &&
"Buffer over or under-filled.");
assert(TotalSize == (uint64_t)utils::getPtrDiff(
- BufferPtr, GlobalsMB->get()->getBufferStart()) &&
+ BufferPtr, GlobalsMB->getBufferStart()) &&
"Buffer size mismatch.");
- StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), TotalSize);
std::error_code EC;
raw_fd_ostream OS(Filename, EC);
- OS << GlobalsMemory;
+ OS.write(GlobalsMB->getBufferStart(), TotalSize);
OS.close();
return Plugin::success();
}
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
new file mode 100644
index 0000000000000..0705c6d66ac8e
--- /dev/null
+++ b/offload/test/tools/omp-kernel-replay/record-replay-empty-memory.cpp
@@ -0,0 +1,26 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: rm -rf %t.testdir
+// RUN: mkdir -p %t.testdir
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
+// clang-format on
+
+// REQUIRES: gpu
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: x86_64-unknown-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: intelgpu
+
+#include <cstdint>
+#include <cstdio>
+
+int main() {
+#pragma omp target teams num_teams(256)
+ {
+ }
+
+ // CHECK: PASS
+ printf("PASS\n");
+}
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index a5bda7a0f0444..4335002fd8c77 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -130,7 +130,8 @@ Error verifyReplayOutput(StringRef RecordOutputFilename,
if (!ReplayOutputBufferOrErr)
return createErr("failed to read the kernel replay output file");
- // Compare record and replay outputs to verify they match.
+ // Compare record and replay outputs to verify they match. If they are both
+ // empty, the verification is successful.
StringRef RecordOutput = RecordOutputBufferOrErr.get()->getBuffer();
StringRef ReplayOutput = ReplayOutputBufferOrErr.get()->getBuffer();
if (RecordOutput != ReplayOutput)
>From 9c50867e78707c7ad9b46b6c2c71ef45ac124bbb Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames at gmail.com>
Date: Fri, 19 Jun 2026 16:45:03 +1000
Subject: [PATCH 08/10] [ORC][examples] Add a new example showing basic
symbolAliases usage. (#204733)
LLJITWithSymbolAliases shows how the symbolAliases function can be used
to introduce aliases for both JIT'd and precompiled symbols.
---
llvm/examples/OrcV2Examples/CMakeLists.txt | 1 +
.../LLJITWithSymbolAliases/CMakeLists.txt | 12 +++
.../LLJITWithSymbolAliases.cpp | 85 +++++++++++++++++++
3 files changed, 98 insertions(+)
create mode 100644 llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt
create mode 100644 llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp
diff --git a/llvm/examples/OrcV2Examples/CMakeLists.txt b/llvm/examples/OrcV2Examples/CMakeLists.txt
index f1189e4ef96ca..e365565a6f9c2 100644
--- a/llvm/examples/OrcV2Examples/CMakeLists.txt
+++ b/llvm/examples/OrcV2Examples/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(LLJITWithLazyReexports)
add_subdirectory(LLJITWithObjectCache)
add_subdirectory(LLJITWithObjectLinkingLayerPlugin)
add_subdirectory(LLJITWithOptimizingIRTransform)
+add_subdirectory(LLJITWithSymbolAliases)
add_subdirectory(LLJITWithThinLTOSummaries)
add_subdirectory(OrcV2CBindingsAddObjectFile)
add_subdirectory(OrcV2CBindingsBasicUsage)
diff --git a/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt
new file mode 100644
index 0000000000000..d821eddf6560e
--- /dev/null
+++ b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+ Core
+ ExecutionEngine
+ IRReader
+ OrcJIT
+ Support
+ nativecodegen
+ )
+
+add_llvm_example(LLJITWithSymbolAliases
+ LLJITWithSymbolAliases.cpp
+ )
diff --git a/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp
new file mode 100644
index 0000000000000..50d9e58554b83
--- /dev/null
+++ b/llvm/examples/OrcV2Examples/LLJITWithSymbolAliases/LLJITWithSymbolAliases.cpp
@@ -0,0 +1,85 @@
+//===-- LLJITWithSymbolAliases.cpp - Symbol aliases with LLJIT ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This example demonstrates how to use the symbolAliases utility to define
+// alternate names for symbols already present in a JITDylib. We define two
+// aliases:
+//
+// - "aliased_foo" as an alias for "foo", a function defined in a JIT'd IR
+// module.
+// - "aliased_bar" as an alias for "bar", a precompiled function added to
+// the JITDylib via absoluteSymbols.
+//
+// We then look up both aliases and call them to confirm that they resolve to
+// the original definitions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "../ExampleModules.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+ExitOnError ExitOnErr;
+
+// IR module containing the simplest possible function: foo returns 42.
+const llvm::StringRef FooMod =
+ R"(
+ define i32 @foo() {
+ entry:
+ ret i32 42
+ }
+)";
+
+// Precompiled function that we will expose to the JIT via absoluteSymbols.
+static int bar() { return 7; }
+
+int main(int argc, char *argv[]) {
+ // Initialize LLVM.
+ InitLLVM X(argc, argv);
+
+ InitializeNativeTarget();
+ InitializeNativeTargetAsmPrinter();
+
+ cl::ParseCommandLineOptions(argc, argv, "LLJITWithSymbolAliases");
+ ExitOnErr.setBanner(std::string(argv[0]) + ": ");
+
+ // Create an LLJIT instance and add the IR module containing 'foo'.
+ auto J = ExitOnErr(LLJITBuilder().create());
+ ExitOnErr(J->addIRModule(ExitOnErr(parseExampleModule(FooMod, "foo-mod"))));
+
+ // Add the precompiled 'bar' function as an absolute symbol.
+ auto &JD = J->getMainJITDylib();
+ ExitOnErr(JD.define(absoluteSymbols(
+ {{J->mangleAndIntern("bar"),
+ {ExecutorAddr::fromPtr(&bar),
+ JITSymbolFlags::Exported | JITSymbolFlags::Callable}}})));
+
+ // Define aliases: 'aliased_foo' -> 'foo' and 'aliased_bar' -> 'bar'.
+ ExitOnErr(JD.define(symbolAliases(
+ {{J->mangleAndIntern("aliased_foo"),
+ {J->mangleAndIntern("foo"),
+ JITSymbolFlags::Exported | JITSymbolFlags::Callable}},
+ {J->mangleAndIntern("aliased_bar"),
+ {J->mangleAndIntern("bar"),
+ JITSymbolFlags::Exported | JITSymbolFlags::Callable}}})));
+
+ // Look up the aliases and call them.
+ auto AliasedFoo = ExitOnErr(J->lookup("aliased_foo")).toPtr<int()>();
+ auto AliasedBar = ExitOnErr(J->lookup("aliased_bar")).toPtr<int()>();
+
+ outs() << "aliased_foo() = " << AliasedFoo() << "\n"
+ << "aliased_bar() = " << AliasedBar() << "\n";
+
+ return 0;
+}
>From 8b329fb9ceaa9e44f6cea375d1d80c8884d55bee Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sat, 30 May 2026 23:45:16 -0700
Subject: [PATCH 09/10] [offload] Fix teams/threads limits in record replay
---
.../common/include/PluginInterface.h | 3 ++
.../common/src/RecordReplay.cpp | 18 ++++++---
.../record-replay-diff-teams-threads.cpp | 37 ++++++++++++++++---
.../record-replay-diff-threads.cpp | 13 ++++---
.../kernelreplay/llvm-omp-kernel-replay.cpp | 17 ++++++++-
5 files changed, 69 insertions(+), 19 deletions(-)
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index cd7e1981435ea..dad061ae3c238 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -449,6 +449,9 @@ struct GenericKernelTy {
/// Get the size of the static per-block memory consumed by the kernel.
uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
+ /// Get the maximum number of threads per block that this kernel may use.
+ uint32_t getMaxThreads() const { return MaxNumThreads; }
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index bd93f79bb8bad..ca6c5e7d98e45 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -270,18 +270,24 @@ Error NativeRecordReplayTy::recordDescImpl(
JsonKernelInfo["VAllocAddr"] = (intptr_t)StartAddr;
JsonKernelInfo["VAllocSize"] = TotalSize;
- // Add minimum and maximum for allowed number of teams. If zero, it means
+ // Export minimum and maximum for allowed number of teams. If zero, it means
// there was no restriction provided by the program.
+ uint32_t MinMaxBlocks = std::max(KernelArgs.UserNumBlocks[0], uint32_t(0));
json::Array JsonTeamsLimits;
- JsonTeamsLimits.push_back(KernelArgs.UserNumBlocks[0]);
- JsonTeamsLimits.push_back(KernelArgs.UserNumBlocks[0]);
+ JsonTeamsLimits.push_back(MinMaxBlocks);
+ JsonTeamsLimits.push_back(MinMaxBlocks);
JsonKernelInfo["TeamsLimits"] = json::Value(std::move(JsonTeamsLimits));
- // Add minimum and maximum for allowed number of threads. If zero, it means
+ // Export minimum and maximum for allowed number of threads. If zero, it means
// there was no restriction provided by the program.
+ uint32_t UserThreads = std::max(KernelArgs.UserThreadLimit[0], uint32_t(0));
+ uint32_t MaxThreads = UserThreads
+ ? std::min(UserThreads, Kernel.getMaxThreads())
+ : Kernel.getMaxThreads();
+ assert(MaxThreads >= 0 && "MaxThreads must be greater than zero.");
json::Array JsonThreadsLimits;
- JsonThreadsLimits.push_back(uint32_t(KernelArgs.UserThreadLimit[0] > 0));
- JsonThreadsLimits.push_back(KernelArgs.UserThreadLimit[0]);
+ JsonThreadsLimits.push_back(1);
+ JsonThreadsLimits.push_back(MaxThreads);
JsonKernelInfo["ThreadsLimits"] = json::Value(std::move(JsonThreadsLimits));
json::Array JsonArgPtrs;
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp b/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
index 803b633315527..e02d6a4d79efa 100644
--- a/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
+++ b/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
@@ -2,12 +2,29 @@
// RUN: %libomptarget-compilexx-generic
// RUN: rm -rf %t.testdir
// RUN: mkdir -p %t.testdir
-// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=1 --num-threads=1 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=2 --num-threads=32 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=32 --num-threads=64 {}
-// clang-format on
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir LIBOMPTARGET_RECORD_REPORT_FILE=report.txt %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: awk '/\.json/ {print $1}' %t.testdir/report.txt | tr -d ',' > %t.testdir/json_list.txt
+// RUN: cat %t.testdir/json_list.txt | count 2
+// RUN: ls -1 %t.testdir/*.json | count 2
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-teams=1 --num-threads=1 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-teams=2 --num-threads=32 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-teams=32 --num-threads=64 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-threads=129 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR1 %s
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=127 %t.testdir/{}
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-threads=1024 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR1 %s
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-teams=2 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR2 %s
+
+// RUN: %libomptarget-compilexx-generic -mllvm -openmp-ir-builder-use-default-max-threads=0
+// RUN: rm -rf %t.testdir
+// RUN: mkdir -p %t.testdir
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir LIBOMPTARGET_RECORD_REPORT_FILE=report.txt %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: awk '/\.json/ {print $1}' %t.testdir/report.txt | tr -d ',' > %t.testdir/json_list.txt
+// RUN: cat %t.testdir/json_list.txt | count 2
+// RUN: ls -1 %t.testdir/*.json | count 2
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=129 %t.testdir/{}
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=1024 %t.testdir/{}
+// RUN: sed -n '2p' %t.testdir/json_list.txt | xargs -I {} not %omp-kernel-replay --verify --num-threads=2048 %t.testdir/{} 2>&1 | FileCheck --check-prefix=REPLAY-ERROR1 %s
// REQUIRES: gpu
@@ -16,6 +33,10 @@
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: intelgpu
+// REPLAY-ERROR1: [llvm-omp-kernel-replay] Error: number of threads ({{[0-9]+}}) is out of the allowed limits (min,max: 1,{{[0-9]+}})
+// REPLAY-ERROR2: [llvm-omp-kernel-replay] Error: number of teams (2) is out of the allowed limits (min,max: 1,1)
+// clang-format on
+
#include <cstdint>
#include <cstdio>
@@ -33,6 +54,10 @@ int main() {
Data[I] = 10 + (uint64_t)I;
}
+#pragma omp target
+ {
+ }
+
uint64_t Sum = 0;
for (size_t I = 0; I < Size; ++I) {
Sum += Data[I];
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp b/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
index 9b65c38f98390..51333f1095124 100644
--- a/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
+++ b/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
@@ -2,11 +2,14 @@
// RUN: %libomptarget-compilexx-generic
// RUN: rm -rf %t.testdir
// RUN: mkdir -p %t.testdir
-// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=1 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=32 {}
-// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=64 {}
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir LIBOMPTARGET_RECORD_REPORT_FILE=report.txt %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: awk '/\.json/ {print $1}' %t.testdir/report.txt | tr -d ',' > %t.testdir/json_list.txt
+// RUN: cat %t.testdir/json_list.txt | count 1
+// RUN: ls -1 %t.testdir/*.json | count 1
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=1 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=32 %t.testdir/{}
+// RUN: sed -n '1p' %t.testdir/json_list.txt | xargs -I {} %omp-kernel-replay --verify --num-threads=64 %t.testdir/{}
// clang-format on
// REQUIRES: gpu
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 4335002fd8c77..44fea1d79cb8c 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -209,16 +209,29 @@ Error replayKernel() {
if (Err)
return Err;
+ // Check that a minimum and maximum have been exported.
if (TeamsLimits.size() != 2 || ThreadsLimits.size() != 2)
return createErr("TeamsLimits and ThreadsLimits must have a min and max");
+ // Check that the minimum and maximum are specified or both are zero.
+ if (bool(TeamsLimits[0]) != bool(TeamsLimits[1]))
+ return createErr("TeamsLimits min and max are inconsistent");
+ if (bool(ThreadsLimits[0]) != bool(ThreadsLimits[1]))
+ return createErr("ThreadsLimits min and max are inconsistent");
+
// If the limits were specified, verify the selected values are valid.
if (TeamsLimits[0] > 0 &&
(NumTeams < TeamsLimits[0] || NumTeams > TeamsLimits[1]))
- return createErr("number of teams is out of the allowed limits");
+ return createErr("number of teams (%" PRIu32
+ ") is out of the allowed limits (min,max: %" PRIu32
+ ",%" PRIu32 ")",
+ NumTeams, TeamsLimits[0], TeamsLimits[1]);
if (ThreadsLimits[0] > 0 &&
(NumThreads < ThreadsLimits[0] || NumThreads > ThreadsLimits[1]))
- return createErr("number of threads is out of the allowed limits");
+ return createErr("number of threads (%" PRIu32
+ ") is out of the allowed limits (min,max: %" PRIu32
+ ",%" PRIu32 ")",
+ NumThreads, ThreadsLimits[0], ThreadsLimits[1]);
// Retrieve the arguments of the kernel.
SmallVector<void *> TgtArgs;
>From a31c7c8e40de88cdb4d656d223c27fedb1382901 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 18 Jun 2026 00:31:41 -0700
Subject: [PATCH 10/10] [offload] Add flag to ignore limits in kernel replay
---
offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 44fea1d79cb8c..353bd9d07696d 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -63,6 +63,11 @@ static cl::opt<uint32_t>
cl::desc("Set the number of replay repetitions."),
cl::init(1), cl::cat(ReplayOptions));
+static cl::opt<bool>
+ IgnoreLimitsOpt("ignore-limits",
+ cl::desc("Ignore thread and team limits (unrecommended)."),
+ cl::init(false), cl::cat(ReplayOptions));
+
template <typename... ArgsTy>
Error createErr(const char *ErrFmt, ArgsTy &&...Args) {
return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt,
@@ -220,13 +225,13 @@ Error replayKernel() {
return createErr("ThreadsLimits min and max are inconsistent");
// If the limits were specified, verify the selected values are valid.
- if (TeamsLimits[0] > 0 &&
+ if (!IgnoreLimitsOpt && TeamsLimits[0] > 0 &&
(NumTeams < TeamsLimits[0] || NumTeams > TeamsLimits[1]))
return createErr("number of teams (%" PRIu32
") is out of the allowed limits (min,max: %" PRIu32
",%" PRIu32 ")",
NumTeams, TeamsLimits[0], TeamsLimits[1]);
- if (ThreadsLimits[0] > 0 &&
+ if (!IgnoreLimitsOpt && ThreadsLimits[0] > 0 &&
(NumThreads < ThreadsLimits[0] || NumThreads > ThreadsLimits[1]))
return createErr("number of threads (%" PRIu32
") is out of the allowed limits (min,max: %" PRIu32
More information about the llvm-branch-commits
mailing list