[libc-commits] [libc] [libc] Remove CPU dependent AMDGPU instructions (PR #80707)
via libc-commits
libc-commits at lists.llvm.org
Mon Feb 5 08:41:53 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-libc
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
Summary:
Some recent changes allowed us to remove target level divergence one
these instructions. This patch removes the wavefront dependent
divergence for the ballot and thread ID functions, as well as the clock.
The changes to the "Vendor" library simply disables target specific
optimizations in the implementation. This should be removed in its
entirety when the LLVM `libm` is sufficiently implemented.
The remaining areas of divergence is only the RPC packet size and the
fixed frequency counter.
---
Full diff: https://github.com/llvm/llvm-project/pull/80707.diff
2 Files Affected:
- (modified) libc/src/__support/GPU/amdgpu/utils.h (+4-18)
- (modified) libc/src/math/gpu/vendor/amdgpu/platform.h (+2-82)
``````````diff
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 96e3efccb3b5a5..58bbe29cb3a7d7 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -113,10 +113,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
- if constexpr (LANE_SIZE == 64)
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
- else
- return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
+ return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
/// Returns the bit-mask of active threads in the current wavefront.
@@ -134,11 +131,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
// the lane_mask & gives the nvptx semantics when lane_mask is a subset of
// the active threads
- if constexpr (LANE_SIZE == 64) {
- return lane_mask & __builtin_amdgcn_ballot_w64(x);
- } else {
- return lane_mask & __builtin_amdgcn_ballot_w32(x);
- }
+ return lane_mask & __builtin_amdgcn_ballot_w64(x);
}
/// Waits for all the threads in the block to converge and issues a fence.
@@ -153,15 +146,8 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
}
/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle cycle counter.
-LIBC_INLINE uint64_t processor_clock() {
- if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
- return __builtin_amdgcn_s_memtime();
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_readcyclecounter))
- return __builtin_readcyclecounter();
- else
- return 0;
-}
+/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
/// the card and can only be queried via the driver.
diff --git a/libc/src/math/gpu/vendor/amdgpu/platform.h b/libc/src/math/gpu/vendor/amdgpu/platform.h
index 160a8508cd8b95..b22240419cd0c1 100644
--- a/libc/src/math/gpu/vendor/amdgpu/platform.h
+++ b/libc/src/math/gpu/vendor/amdgpu/platform.h
@@ -32,88 +32,8 @@ extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
// Disable finite math optimizations.
extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
-#if defined(__gfx700__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7000;
-#elif defined(__gfx701__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7001;
-#elif defined(__gfx702__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7002;
-#elif defined(__gfx703__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7003;
-#elif defined(__gfx704__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7004;
-#elif defined(__gfx705__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7005;
-#elif defined(__gfx801__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8001;
-#elif defined(__gfx802__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8002;
-#elif defined(__gfx803__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8003;
-#elif defined(__gfx805__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8005;
-#elif defined(__gfx810__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8100;
-#elif defined(__gfx900__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
-#elif defined(__gfx902__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9002;
-#elif defined(__gfx904__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9004;
-#elif defined(__gfx906__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9006;
-#elif defined(__gfx908__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9008;
-#elif defined(__gfx909__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9009;
-#elif defined(__gfx90a__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9010;
-#elif defined(__gfx90c__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9012;
-#elif defined(__gfx940__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9400;
-#elif defined(__gfx941__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9401;
-#elif defined(__gfx942__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9402;
-#elif defined(__gfx1010__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10100;
-#elif defined(__gfx1011__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10101;
-#elif defined(__gfx1012__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10102;
-#elif defined(__gfx1013__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10103;
-#elif defined(__gfx1030__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10300;
-#elif defined(__gfx1031__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10301;
-#elif defined(__gfx1032__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10302;
-#elif defined(__gfx1033__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10303;
-#elif defined(__gfx1034__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10304;
-#elif defined(__gfx1035__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10305;
-#elif defined(__gfx1036__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10306;
-#elif defined(__gfx1100__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11000;
-#elif defined(__gfx1101__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11001;
-#elif defined(__gfx1102__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11002;
-#elif defined(__gfx1103__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11003;
-#elif defined(__gfx1150__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11500;
-#elif defined(__gfx1151__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11501;
-#else
-#error "Unknown AMDGPU architecture"
-#endif
-}
+// Set the ISA to zero to ingore all ISA specific optimizations.
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 0;
// These aliases cause clang to emit the control constants with ODR linkage.
// This allows us to link against the symbols without preventing them from being
``````````
</details>
https://github.com/llvm/llvm-project/pull/80707
More information about the libc-commits
mailing list