[clang] 298f1c2 - Revert "Add missing intrinsics to cuda headers" (#144755)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jun 18 10:08:31 PDT 2025
Author: Artem Belevich
Date: 2025-06-18T10:08:27-07:00
New Revision: 298f1c276f4f9c18b25a79ffe6e619e89c5fbf7e
URL: https://github.com/llvm/llvm-project/commit/298f1c276f4f9c18b25a79ffe6e619e89c5fbf7e
DIFF: https://github.com/llvm/llvm-project/commit/298f1c276f4f9c18b25a79ffe6e619e89c5fbf7e.diff
LOG: Revert "Add missing intrinsics to cuda headers" (#144755)
Reverts llvm/llvm-project#143664
as it breaks CUDA compilation.
Added:
Modified:
clang/lib/Headers/__clang_cuda_intrinsics.h
Removed:
################################################################################
diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 5e13f3f78df70..8b230af6f6647 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -479,290 +479,6 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
return ret;
}
-#pragma push_macro("__INTRINSIC_LOAD")
-#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
- __Clobber) \
- inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
- __TmpType __ret; \
- asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \
- return (__DeclType)__ret; \
- }
-
-#pragma push_macro("__INTRINSIC_LOAD2")
-#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
- __Clobber) \
- inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
- __DeclType __ret; \
- __TmpType __tmp; \
- asm(__AsmOp " {%0,%1}, [%2];" \
- : __AsmType(__tmp.x), __AsmType(__tmp.y) \
- : "l"(__ptr)__Clobber); \
- using __ElementType = decltype(__ret.x); \
- __ret.x = (__ElementType)(__tmp.x); \
- __ret.y = (__ElementType)__tmp.y; \
- return __ret; \
- }
-
-#pragma push_macro("__INTRINSIC_LOAD4")
-#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
- __Clobber) \
- inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
- __DeclType __ret; \
- __TmpType __tmp; \
- asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \
- : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \
- __AsmType(__tmp.w) \
- : "l"(__ptr)__Clobber); \
- using __ElementType = decltype(__ret.x); \
- __ret.x = (__ElementType)__tmp.x; \
- __ret.y = (__ElementType)__tmp.y; \
- __ret.z = (__ElementType)__tmp.z; \
- __ret.w = (__ElementType)__tmp.w; \
- return __ret; \
- }
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
- "=l", );
-
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", );
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
- "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
- "=h", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
- "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
- unsigned long long, "=l", );
-
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
- "=l", );
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
-
-inline __device__ long __ldcg(const long *__ptr) {
- unsigned long __ret;
- if (sizeof(long) == 8) {
- asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
- } else {
- asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
- }
- return (long)__ret;
-}
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
- "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
- "=h", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
- "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
- unsigned long long, "=l", : "memory");
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
- "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
- "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
- "=h", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
- "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
- "=l", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
- "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
- "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
- "=h", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
- "=h", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
- "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
- "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
- "=l", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
- "=h", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
- "=h", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
- "=l", : "memory");
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
- "=f", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
- "=f", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
- "=d", : "memory");
-
-inline __device__ long __ldcv(const long *__ptr) {
- unsigned long __ret;
- if (sizeof(long) == 8) {
- asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
- } else {
- asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
- }
- return (long)__ret;
-}
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
- "=l", );
-
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
- "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
- "=h", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
- "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
- unsigned long long, "=l", );
-
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
- "=l", );
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
-
-#pragma pop_macro("__INTRINSIC_LOAD")
-#pragma pop_macro("__INTRINSIC_LOAD2")
-#pragma pop_macro("__INTRINSIC_LOAD4")
-
-inline __device__ long __ldcs(const long *__ptr) {
- unsigned long __ret;
- if (sizeof(long) == 8) {
- asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
- } else {
- asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
- }
- return (long)__ret;
-}
-
-#pragma push_macro("__INTRINSIC_STORE")
-#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
- inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
- __TmpType __tmp = (__TmpType)__value; \
- asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \
- }
-
-#pragma push_macro("__INTRINSIC_STORE2")
-#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType, \
- __AsmType) \
- inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
- __TmpType __tmp; \
- using __ElementType = decltype(__tmp.x); \
- __tmp.x = (__ElementType)(__value.x); \
- __tmp.y = (__ElementType)(__value.y); \
- asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \
- __AsmType(__tmp.y) \
- : "memory"); \
- }
-
-#pragma push_macro("__INTRINSIC_STORE4")
-#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType, \
- __AsmType) \
- inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
- __TmpType __tmp; \
- using __ElementType = decltype(__tmp.x); \
- __tmp.x = (__ElementType)(__value.x); \
- __tmp.y = (__ElementType)(__value.y); \
- __tmp.z = (__ElementType)(__value.z); \
- __tmp.w = (__ElementType)(__value.w); \
- asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \
- __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \
- : "memory"); \
- }
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
-
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
- "h");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
- unsigned long long, "l");
-
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
-__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
-
-#pragma pop_macro("__INTRINSIC_STORE")
-#pragma pop_macro("__INTRINSIC_STORE2")
-#pragma pop_macro("__INTRINSIC_STORE4")
-
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
#if CUDA_VERSION >= 11000
More information about the cfe-commits
mailing list