[clang] [llvm] [HLSL][SPIRV][DXIL] Implement InterlockedOr builtin (PR #204923)
Sietze Riemersma via cfe-commits
cfe-commits at lists.llvm.org
Sat Jun 20 05:03:41 PDT 2026
https://github.com/KungFuDonkey created https://github.com/llvm/llvm-project/pull/204923
This PR adds the InterlockedOr function to HLSL. A similar PR (#180804) was opened for this last year but was never merged, so I reimplemented it. InterlockedAdd landed recently, which made this change easy enough for me to do.
I also factored the InterlockedAdd code paths into shared helpers so that future Interlocked functions can reuse them.
>From 9a4648ee49b5b0a7b255b354adf360d9f81bf530 Mon Sep 17 00:00:00 2001
From: KungFuDonkey <sietze.riemersma at gmail.com>
Date: Sat, 20 Jun 2026 13:53:41 +0200
Subject: [PATCH] [HLSL][SPIRV][DXIL] Implement InterlockedOr builtin
---
clang/include/clang/Basic/Builtins.td | 6 ++
clang/lib/CodeGen/CGHLSLBuiltins.cpp | 64 ++++++-----
clang/lib/CodeGen/CGHLSLRuntime.h | 1 +
clang/lib/Sema/HLSLExternalSemaSource.cpp | 18 ++++
clang/lib/Sema/SemaHLSL.cpp | 7 +-
.../CodeGenHLSL/builtins/InterlockedOr.hlsl | 59 +++++++++++
.../BuiltIns/InterlockedOr-errors.hlsl | 100 ++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsDirectX.td | 4 +
llvm/include/llvm/IR/IntrinsicsSPIRV.td | 4 +
.../Target/DirectX/DXILIntrinsicExpansion.cpp | 13 ++-
.../Target/SPIRV/SPIRVInstructionSelector.cpp | 19 ++--
llvm/test/CodeGen/DirectX/InterlockedOr.ll | 52 +++++++++
.../SPIRV/hlsl-intrinsics/InterlockedOr.ll | 36 +++++++
.../hlsl-intrinsics/InterlockedOr_spv_i64.ll | 37 +++++++
14 files changed, 378 insertions(+), 42 deletions(-)
create mode 100644 clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl
create mode 100644 clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl
create mode 100644 llvm/test/CodeGen/DirectX/InterlockedOr.ll
create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll
create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 053a257ba6d4a..61e63c4d9b073 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5465,6 +5465,12 @@ def HLSLInterlockedAdd : LangBuiltin<"HLSL_LANG"> {
let Prototype = "void (...)";
}
+def HLSLInterlockedOr : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_interlocked_or"];
+ let Attributes = [NoThrow];
+ let Prototype = "void (...)";
+}
+
def HLSLWaveActiveBallot : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_wave_active_ballot"];
let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 20a2119e28ce1..5f184dbb91068 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -172,6 +172,35 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) {
return LastInst;
}
+// Emit an HLSL Interlocked* atomic operation. All Interlocked* builtins share
+// the same shape, differing only in the target intrinsic:
+// void Interlocked<Op>(groupshared|device T &dest, T value);
+// void Interlocked<Op>(groupshared|device T &dest, T value,
+// T &original_value);
+// Both `dest` and `original_value` are plain references, so we can use the
+// underlying lvalue directly without HLSLOutArgExpr unwrapping.
+static Value *handleHlslInterlocked(CodeGenFunction &CGF, const CallExpr *E,
+ Intrinsic::ID ID, const Twine &Name) {
+ LValue DestLV = CGF.EmitLValue(E->getArg(0));
+ Value *Ptr = DestLV.getAddress().emitRawPointer(CGF);
+ Value *Val = CGF.EmitScalarExpr(E->getArg(1));
+ assert(E->getArg(1)->getType()->isIntegerType() &&
+ "Intrinsic Interlocked value operand must be an integer");
+
+ Value *Call = CGF.EmitRuntimeCall(
+ Intrinsic::getOrInsertDeclaration(&CGF.CGM.getModule(), ID,
+ {Val->getType(), Ptr->getType()}),
+ ArrayRef<Value *>{Ptr, Val}, Name);
+
+ // The 3-arg overload writes the old value (the intrinsic's return value)
+ // into the `original_value` reference parameter.
+ if (E->getNumArgs() == 3) {
+ LValue OrigLV = CGF.EmitLValue(E->getArg(2));
+ CGF.EmitStoreThroughLValue(RValue::get(Call), OrigLV);
+ }
+ return Call;
+}
+
static Value *handleHlslWaveActiveBallot(CodeGenFunction &CGF,
const CallExpr *E) {
Value *Cond = CGF.EmitScalarExpr(E->getArg(0));
@@ -1427,33 +1456,14 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op},
"hlsl.wave.active.bit.and");
}
- case Builtin::BI__builtin_hlsl_interlocked_add: {
- // HLSL signatures (synthesized as overloads in HLSLExternalSemaSource):
- // void InterlockedAdd(groupshared|device T &dest, T value);
- // void InterlockedAdd(groupshared|device T &dest, T value,
- // T &original_value);
- // Both `dest` and `original_value` are plain references, so we can use
- // the underlying lvalue directly without HLSLOutArgExpr unwrapping.
- LValue DestLV = EmitLValue(E->getArg(0));
- Value *Ptr = DestLV.getAddress().emitRawPointer(*this);
- Value *Val = EmitScalarExpr(E->getArg(1));
- assert(E->getArg(1)->getType()->isIntegerType() &&
- "Intrinsic InterlockedAdd value operand must be an integer");
-
- Intrinsic::ID ID = CGM.getHLSLRuntime().getInterlockedAddIntrinsic();
- Value *Call = EmitRuntimeCall(
- Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID,
- {Val->getType(), Ptr->getType()}),
- ArrayRef<Value *>{Ptr, Val}, "hlsl.interlocked.add");
-
- // The 3-arg overload writes the old value (the intrinsic's return value)
- // into the `original_value` reference parameter.
- if (E->getNumArgs() == 3) {
- LValue OrigLV = EmitLValue(E->getArg(2));
- EmitStoreThroughLValue(RValue::get(Call), OrigLV);
- }
- return Call;
- }
+ case Builtin::BI__builtin_hlsl_interlocked_add:
+ return handleHlslInterlocked(
+ *this, E, CGM.getHLSLRuntime().getInterlockedAddIntrinsic(),
+ "hlsl.interlocked.add");
+ case Builtin::BI__builtin_hlsl_interlocked_or:
+ return handleHlslInterlocked(
+ *this, E, CGM.getHLSLRuntime().getInterlockedOrIntrinsic(),
+ "hlsl.interlocked.or");
case Builtin::BI__builtin_hlsl_wave_active_ballot: {
[[maybe_unused]] Value *Op = EmitScalarExpr(E->getArg(0));
assert(Op->getType()->isIntegerTy(1) &&
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index a126d4612a5f4..154d19ff7bd25 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -152,6 +152,7 @@ class CGHLSLRuntime {
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveBitXor, wave_reduce_xor)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveBitAnd, wave_reduce_and)
GENERATE_HLSL_INTRINSIC_FUNCTION(InterlockedAdd, interlocked_add)
+ GENERATE_HLSL_INTRINSIC_FUNCTION(InterlockedOr, interlocked_or)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveMax, wave_reduce_max)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveUMax, wave_reduce_umax)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveMin, wave_reduce_min)
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index 3f7255cb3f8a7..537357be5e2bd 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -760,8 +760,26 @@ static void defineHLSLInterlockedAdd(Sema &S, NamespaceDecl *NS) {
ThreeArg);
}
+// Synthesize the InterlockedOr overload set: {int, uint, int64_t, uint64_t}
+// x {groupshared, device} x {2-arg, 3-arg}.
+static void defineHLSLInterlockedOr(Sema &S, NamespaceDecl *NS) {
+ ASTContext &AST = S.getASTContext();
+ // HLSL: int64_t == long, uint64_t == unsigned long (see hlsl_basic_types.h).
+ QualType Elems[] = {AST.IntTy, AST.UnsignedIntTy, AST.LongTy,
+ AST.UnsignedLongTy};
+ LangAS AddrSpaces[] = {LangAS::hlsl_groupshared, LangAS::hlsl_device};
+
+ for (QualType ElemTy : Elems)
+ for (LangAS AS : AddrSpaces)
+ for (bool ThreeArg : {false, true})
+ buildAtomicOverload(S, NS, "InterlockedOr",
+ "__builtin_hlsl_interlocked_or", ElemTy, AS,
+ ThreeArg);
+}
+
void HLSLExternalSemaSource::defineHLSLAtomicIntrinsics() {
defineHLSLInterlockedAdd(*SemaPtr, HLSLNamespace);
+ defineHLSLInterlockedOr(*SemaPtr, HLSLNamespace);
}
void HLSLExternalSemaSource::onCompletion(CXXRecordDecl *Record,
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 075dc97b0aef2..e3d8e4ff22bcb 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -4534,10 +4534,11 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
TheCall->setType(ArgTyExpr);
break;
}
- case Builtin::BI__builtin_hlsl_interlocked_add: {
+ case Builtin::BI__builtin_hlsl_interlocked_add:
+ case Builtin::BI__builtin_hlsl_interlocked_or: {
// The builtin's prototype in Builtins.td is `void (...)`, so direct calls
- // to `__builtin_hlsl_interlocked_add` bypass argument checking entirely.
- // When reached via the synthesized `InterlockedAdd` overload set in
+ // to `__builtin_hlsl_interlocked_*` bypass argument checking entirely.
+ // When reached via the synthesized `Interlocked*` overload set in
// HLSLExternalSemaSource, overload resolution has already enforced the
// argument count, integer-type matching, and the address-space requirement
// on `dest`. The checks below are a safety net for callers that invoke the
diff --git a/clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl b/clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl
new file mode 100644
index 0000000000000..a4c4f4cc7dd6c
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: dxil-pc-shadermodel6.6-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK
+
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \
+// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
+// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK
+
+// Test basic lowering of HLSL InterlockedOr to the target intrinsic.
+
+groupshared int gs_i32;
+groupshared uint gs_u32;
+groupshared int64_t gs_i64;
+groupshared uint64_t gs_u64;
+
+// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_int_2arg
+// DXCHECK: call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}})
+// SPVCHECK: call spir_func i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}})
+export void test_int_2arg(int v) {
+ InterlockedOr(gs_i32, v);
+}
+
+// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_uint_2arg
+// DXCHECK: call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}})
+// SPVCHECK: call spir_func i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}})
+export void test_uint_2arg(uint v) {
+ InterlockedOr(gs_u32, v);
+}
+
+// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_int_3arg
+// DXCHECK: %[[R:.*]] = call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}})
+// SPVCHECK: %[[R:.*]] = call spir_func i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}})
+// CHECK: store i32 %[[R]], ptr {{.*}}
+export void test_int_3arg(int v, out int orig) {
+ InterlockedOr(gs_i32, v, orig);
+}
+
+// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_uint_3arg
+// DXCHECK: %[[R:.*]] = call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}})
+// SPVCHECK: %[[R:.*]] = call spir_func i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}})
+// CHECK: store i32 %[[R]], ptr {{.*}}
+export void test_uint_3arg(uint v, out uint orig) {
+ InterlockedOr(gs_u32, v, orig);
+}
+
+// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_int64_2arg
+// DXCHECK: call i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_i64{{.*}}, i64 %{{.*}})
+// SPVCHECK: call spir_func i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_i64{{.*}}, i64 %{{.*}})
+export void test_int64_2arg(int64_t v) {
+ InterlockedOr(gs_i64, v);
+}
+
+// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_uint64_3arg
+// DXCHECK: %[[R:.*]] = call i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_u64{{.*}}, i64 %{{.*}})
+// SPVCHECK: %[[R:.*]] = call spir_func i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_u64{{.*}}, i64 %{{.*}})
+// CHECK: store i64 %[[R]], ptr {{.*}}
+export void test_uint64_3arg(uint64_t v, out uint64_t orig) {
+ InterlockedOr(gs_u64, v, orig);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl
new file mode 100644
index 0000000000000..faa2825139ad4
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header \
+// RUN: -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only \
+// RUN: -disable-llvm-passes -verify
+
+// InterlockedOr is provided as a set of address-space-qualified overloads
+// (groupshared/device, {int,uint,int64_t,uint64_t}, 2-arg/3-arg). All arg
+// mismatches surface as "no matching function" with 16 candidates. The
+// candidate notes come from synthesized FunctionDecls with no source
+// location, so they are matched with `@*:*`.
+
+groupshared int gs_i32;
+groupshared float gs_f32;
+struct S { int x; };
+groupshared S gs_s;
+
+void too_few(int v) {
+ InterlockedOr(gs_i32); // expected-error{{no matching function for call to 'InterlockedOr'}}
+ // expected-note@*:* 16 {{candidate function}}
+}
+
+void too_many(int v, int extra) {
+ int o;
+ InterlockedOr(gs_i32, v, o, extra); // expected-error{{no matching function for call to 'InterlockedOr'}}
+ // expected-note@*:* 16 {{candidate function}}
+}
+
+// Atomics must operate on actual addresses in groupshared or device memory;
+// passing a plain local (no address space) must not bind to any overload.
+void local_dest(int v) {
+ int dest;
+ InterlockedOr(dest, v); // expected-error{{no matching function for call to 'InterlockedOr'}}
+ // expected-note@*:* 16 {{candidate function}}
+}
+
+void float_dest(float v) {
+ InterlockedOr(gs_f32, v); // expected-error{{no matching function for call to 'InterlockedOr'}}
+ // expected-note@*:* 16 {{candidate function}}
+}
+
+void struct_dest(int v) {
+ InterlockedOr(gs_s, v); // expected-error{{no matching function for call to 'InterlockedOr'}}
+ // expected-note@*:* 16 {{candidate function}}
+}
+
+void mismatched_orig_type(int v) {
+ uint orig;
+ InterlockedOr(gs_i32, v, orig); // expected-error{{no matching function for call to 'InterlockedOr'}}
+ // expected-note@*:* 16 {{candidate function}}
+}
+
+// The tests below exercise direct invocations of the underlying clang builtin
+// `__builtin_hlsl_interlocked_or`. These bypass overload resolution against
+// the synthesized `InterlockedOr` overload set (the builtin's prototype in
+// Builtins.td is `void (...)`), so each error is produced by the explicit
+// checks in SemaHLSL.cpp rather than by candidate-set rejection.
+
+void direct_too_few() {
+ __builtin_hlsl_interlocked_or(gs_i32);
+ // expected-error@-1 {{too few arguments to function call, expected at least 2, have 1}}
+}
+
+void direct_too_many(int v, int extra) {
+ int o;
+ __builtin_hlsl_interlocked_or(gs_i32, v, o, extra);
+ // expected-error@-1 {{too many arguments to function call, expected at most 3, have 4}}
+}
+
+void direct_non_integer_dest() {
+ S local_s;
+ __builtin_hlsl_interlocked_or(local_s, 1);
+ // expected-error@-1 {{1st argument must be a scalar integer type (was 'S')}}
+}
+
+void direct_nonlvalue_dest(int v) {
+ __builtin_hlsl_interlocked_or(1, v);
+ // expected-error@-1 {{cannot bind non-lvalue argument '1' to out parameter}}
+}
+
+void direct_mismatched_value() {
+ uint uv = 1u;
+ __builtin_hlsl_interlocked_or(gs_i32, uv);
+ // expected-error@-1 {{passing 'uint' (aka 'unsigned int') to parameter of incompatible type 'int'}}
+}
+
+void direct_mismatched_orig(int v) {
+ uint orig;
+ __builtin_hlsl_interlocked_or(gs_i32, v, orig);
+ // expected-error@-1 {{passing 'uint' (aka 'unsigned int') to parameter of incompatible type 'int'}}
+}
+
+void direct_nonlvalue_orig(int v) {
+ __builtin_hlsl_interlocked_or(gs_i32, v, 1);
+ // expected-error@-1 {{cannot bind non-lvalue argument '1' to out parameter}}
+}
+
+void direct_default_as_dest(int v) {
+ int local;
+ __builtin_hlsl_interlocked_or(local, v);
+ // expected-error@-1 {{1st argument to atomic builtin must reference groupshared or device memory (was 'int')}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index af360dfc78965..d2db4905aeabe 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -261,6 +261,10 @@ def int_dx_interlocked_add :
DefaultAttrsIntrinsic<[llvm_anyint_ty],
[llvm_anyptr_ty, LLVMMatchType<0>],
[IntrArgMemOnly]>;
+def int_dx_interlocked_or :
+ DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ [llvm_anyptr_ty, LLVMMatchType<0>],
+ [IntrArgMemOnly]>;
def int_dx_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
def int_dx_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
def int_dx_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 6e4cf8f7e72dc..5c59a32ddce99 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -152,6 +152,10 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]
DefaultAttrsIntrinsic<[llvm_anyint_ty],
[llvm_anyptr_ty, LLVMMatchType<0>],
[IntrArgMemOnly]>;
+ def int_spv_interlocked_or :
+ DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ [llvm_anyptr_ty, LLVMMatchType<0>],
+ [IntrArgMemOnly]>;
def int_spv_subgroup_ballot : ClangBuiltin<"__builtin_spirv_subgroup_ballot">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>;
def int_spv_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>;
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 88eda6656d89b..62fb8d1b12891 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -228,6 +228,7 @@ static bool isIntrinsicExpansion(Function &F) {
case Intrinsic::dx_step:
case Intrinsic::dx_radians:
case Intrinsic::dx_interlocked_add:
+ case Intrinsic::dx_interlocked_or:
case Intrinsic::usub_sat:
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_fadd:
@@ -771,15 +772,16 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) {
return Builder.CreateFMul(X, PiOver180);
}
-static Value *expandInterlockedAddIntrinsic(CallInst *Orig) {
- // Lower @llvm.dx.interlocked.add(ptr, val) to `atomicrmw add ptr, val
+static Value *expandInterlockedIntrinsic(CallInst *Orig,
+ AtomicRMWInst::BinOp Op) {
+ // Lower @llvm.dx.interlocked.<op>(ptr, val) to `atomicrmw <op> ptr, val
// monotonic`. HLSL Interlocked operations imply no fence/barrier, which maps
// to monotonic ordering. The instruction's result is the old value, matching
// the intrinsic's return value.
Value *Ptr = Orig->getArgOperand(0);
Value *Val = Orig->getArgOperand(1);
IRBuilder<> Builder(Orig);
- return Builder.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Val, MaybeAlign(),
+ return Builder.CreateAtomicRMW(Op, Ptr, Val, MaybeAlign(),
AtomicOrdering::Monotonic);
}
@@ -1245,7 +1247,10 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) {
Result = expandRadiansIntrinsic(Orig);
break;
case Intrinsic::dx_interlocked_add:
- Result = expandInterlockedAddIntrinsic(Orig);
+ Result = expandInterlockedIntrinsic(Orig, AtomicRMWInst::Add);
+ break;
+ case Intrinsic::dx_interlocked_or:
+ Result = expandInterlockedIntrinsic(Orig, AtomicRMWInst::Or);
break;
case Intrinsic::dx_resource_load_rawbuffer:
if (expandBufferLoadIntrinsic(Orig, /*IsRaw*/ true))
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index cd99015a61ba9..2220fc72e3837 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -177,8 +177,8 @@ class SPIRVInstructionSelector : public InstructionSelector {
bool selectAtomicRMW(Register ResVReg, SPIRVTypeInst ResType, MachineInstr &I,
unsigned NewOpcode, unsigned NegateOpcode = 0) const;
- bool selectInterlockedAdd(Register ResVReg, SPIRVTypeInst ResType,
- MachineInstr &I) const;
+ bool selectInterlocked(Register ResVReg, SPIRVTypeInst ResType,
+ MachineInstr &I, unsigned Opcode) const;
bool selectAtomicCmpXchg(Register ResVReg, SPIRVTypeInst ResType,
MachineInstr &I) const;
@@ -2445,16 +2445,17 @@ bool SPIRVInstructionSelector::selectAtomicRMW(Register ResVReg,
return true;
}
-bool SPIRVInstructionSelector::selectInterlockedAdd(Register ResVReg,
- SPIRVTypeInst ResType,
- MachineInstr &I) const {
+bool SPIRVInstructionSelector::selectInterlocked(Register ResVReg,
+ SPIRVTypeInst ResType,
+ MachineInstr &I,
+ unsigned Opcode) const {
Register Ptr = I.getOperand(2).getReg();
Register Value = I.getOperand(3).getReg();
SPIRV::StorageClass::StorageClass SC = GR.getPointerStorageClass(Ptr);
assert((SC == SPIRV::StorageClass::Workgroup ||
SC == SPIRV::StorageClass::StorageBuffer) &&
- "InterlockedAdd requires Workgroup or StorageBuffer storage class");
+ "Interlocked op requires Workgroup or StorageBuffer storage class");
uint32_t Scope = static_cast<uint32_t>(SC == SPIRV::StorageClass::Workgroup
? SPIRV::Scope::Workgroup
: SPIRV::Scope::Device);
@@ -2463,7 +2464,7 @@ bool SPIRVInstructionSelector::selectInterlockedAdd(Register ResVReg,
uint32_t MemSem = static_cast<uint32_t>(getMemSemanticsForStorageClass(SC));
Register MemSemReg = buildI32Constant(MemSem, I);
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpAtomicIAdd))
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode))
.addDef(ResVReg)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(Ptr)
@@ -5362,7 +5363,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectWaveReduceOp(ResVReg, ResType, I,
SPIRV::OpGroupNonUniformBitwiseAnd);
case Intrinsic::spv_interlocked_add:
- return selectInterlockedAdd(ResVReg, ResType, I);
+ return selectInterlocked(ResVReg, ResType, I, SPIRV::OpAtomicIAdd);
+ case Intrinsic::spv_interlocked_or:
+ return selectInterlocked(ResVReg, ResType, I, SPIRV::OpAtomicOr);
case Intrinsic::spv_wave_reduce_umax:
return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true);
case Intrinsic::spv_wave_reduce_max:
diff --git a/llvm/test/CodeGen/DirectX/InterlockedOr.ll b/llvm/test/CodeGen/DirectX/InterlockedOr.ll
new file mode 100644
index 0000000000000..34387adefeed3
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/InterlockedOr.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.6-compute %s | FileCheck %s
+
+; Verify llvm.dx.interlocked.or expands to atomicrmw or monotonic.
+
+; Groupshared (addrspace 3) memory tests.
+@gs_i32 = internal addrspace(3) global i32 zeroinitializer
+@gs_i64 = internal addrspace(3) global i64 zeroinitializer
+
+define i32 @test_i32(i32 %v) {
+entry:
+; CHECK-LABEL: @test_i32
+; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(3) @gs_i32, i32 %v monotonic
+; CHECK: ret i32 %[[R]]
+ %r = call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) @gs_i32, i32 %v)
+ ret i32 %r
+}
+
+define i64 @test_i64(i64 %v) {
+entry:
+; CHECK-LABEL: @test_i64
+; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(3) @gs_i64, i64 %v monotonic
+; CHECK: ret i64 %[[R]]
+ %r = call i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3) @gs_i64, i64 %v)
+ ret i64 %r
+}
+
+; Device (addrspace 1) memory tests.
+@dev_i32 = external addrspace(1) global i32
+@dev_i64 = external addrspace(1) global i64
+
+define i32 @test_device_i32(i32 %v) {
+entry:
+; CHECK-LABEL: @test_device_i32
+; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(1) @dev_i32, i32 %v monotonic
+; CHECK: ret i32 %[[R]]
+ %r = call i32 @llvm.dx.interlocked.or.i32.p1(ptr addrspace(1) @dev_i32, i32 %v)
+ ret i32 %r
+}
+
+define i64 @test_device_i64(i64 %v) {
+entry:
+; CHECK-LABEL: @test_device_i64
+; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(1) @dev_i64, i64 %v monotonic
+; CHECK: ret i64 %[[R]]
+ %r = call i64 @llvm.dx.interlocked.or.i64.p1(ptr addrspace(1) @dev_i64, i64 %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3), i32)
+declare i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3), i64)
+declare i32 @llvm.dx.interlocked.or.i32.p1(ptr addrspace(1), i32)
+declare i64 @llvm.dx.interlocked.or.i64.p1(ptr addrspace(1), i64)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll
new file mode 100644
index 0000000000000..a8b3c692f9bc0
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
+
+; Test lowering of llvm.spv.interlocked.or to OpAtomicOr.
+
+; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#scope_wg:]] = OpConstant %[[#uint]] 2
+; CHECK-DAG: %[[#scope_dev:]] = OpConstant %[[#uint]] 1
+; CHECK-DAG: %[[#mem_wg:]] = OpConstant %[[#uint]] 256
+; CHECK-DAG: %[[#mem_uniform:]] = OpConstant %[[#uint]] 64
+
+@gs_i32 = internal addrspace(3) global i32 zeroinitializer
+@dev_i32 = external addrspace(11) global i32
+
+; Workgroup (addrspace 3) memory tests.
+
+; CHECK-LABEL: Begin function test_i32
+define i32 @test_i32(i32 %v) {
+entry:
+; CHECK: %[[#R:]] = OpAtomicOr %[[#uint]] %[[#]] %[[#scope_wg]] %[[#mem_wg]] %[[#]]
+ %r = call i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) @gs_i32, i32 %v)
+ ret i32 %r
+}
+
+; Device / StorageBuffer (addrspace 11) memory tests.
+
+; CHECK-LABEL: Begin function test_device_i32
+define i32 @test_device_i32(i32 %v) {
+entry:
+; CHECK: %[[#R:]] = OpAtomicOr %[[#uint]] %[[#]] %[[#scope_dev]] %[[#mem_uniform]] %[[#]]
+ %r = call i32 @llvm.spv.interlocked.or.i32.p11(ptr addrspace(11) @dev_i32, i32 %v)
+ ret i32 %r
+}
+
+declare i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3), i32)
+declare i32 @llvm.spv.interlocked.or.i32.p11(ptr addrspace(11), i32)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll
new file mode 100644
index 0000000000000..18a16229ee718
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll
@@ -0,0 +1,37 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
+
+; Test lowering of llvm.spv.interlocked.or with i64 to OpAtomicOr.
+
+; CHECK-DAG: %[[#ulong:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#scope_wg:]] = OpConstant %[[#uint]] 2
+; CHECK-DAG: %[[#scope_dev:]] = OpConstant %[[#uint]] 1
+; CHECK-DAG: %[[#mem_wg:]] = OpConstant %[[#uint]] 256
+; CHECK-DAG: %[[#mem_uniform:]] = OpConstant %[[#uint]] 64
+
+@gs_i64 = internal addrspace(3) global i64 zeroinitializer
+@dev_i64 = external addrspace(11) global i64
+
+; Workgroup (addrspace 3) memory test.
+
+; CHECK-LABEL: Begin function test_i64
+define i64 @test_i64(i64 %v) {
+entry:
+; CHECK: %[[#R:]] = OpAtomicOr %[[#ulong]] %[[#]] %[[#scope_wg]] %[[#mem_wg]] %[[#]]
+ %r = call i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3) @gs_i64, i64 %v)
+ ret i64 %r
+}
+
+; Device / StorageBuffer (addrspace 11) memory test.
+
+; CHECK-LABEL: Begin function test_device_i64
+define i64 @test_device_i64(i64 %v) {
+entry:
+; CHECK: %[[#R:]] = OpAtomicOr %[[#ulong]] %[[#]] %[[#scope_dev]] %[[#mem_uniform]] %[[#]]
+ %r = call i64 @llvm.spv.interlocked.or.i64.p11(ptr addrspace(11) @dev_i64, i64 %v)
+ ret i64 %r
+}
+
+declare i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3), i64)
+declare i64 @llvm.spv.interlocked.or.i64.p11(ptr addrspace(11), i64)
More information about the cfe-commits
mailing list