[libc-commits] [libc] [libc] Remove CPU dependent AMDGPU instructions (PR #80707)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Tue Feb 6 05:15:44 PST 2024
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/80707
>From 28bb4977813c827ec9853513baa866985d7fb1e3 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 5 Feb 2024 10:35:42 -0600
Subject: [PATCH 1/3] [libc] Remove CPU dependent AMDGPU instructions
Summary:
Some recent changes allowed us to remove target level divergence on
these instructions. This patch removes the wavefront dependent
divergence for the ballot and thread ID functions, as well as the clock.
The changes to the "Vendor" library simply disable target specific
optimizations in the implementation. This should be removed in its
entirety when the LLVM `libm` is sufficiently implemented.
The only remaining areas of divergence are the RPC packet size and the
fixed frequency counter.
---
libc/src/__support/GPU/amdgpu/utils.h | 22 ++----
libc/src/math/gpu/vendor/amdgpu/platform.h | 84 +---------------------
2 files changed, 6 insertions(+), 100 deletions(-)
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 96e3efccb3b5a5..58bbe29cb3a7d7 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -113,10 +113,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
- if constexpr (LANE_SIZE == 64)
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
- else
- return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
+ return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
/// Returns the bit-mask of active threads in the current wavefront.
@@ -134,11 +131,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
// the lane_mask & gives the nvptx semantics when lane_mask is a subset of
// the active threads
- if constexpr (LANE_SIZE == 64) {
- return lane_mask & __builtin_amdgcn_ballot_w64(x);
- } else {
- return lane_mask & __builtin_amdgcn_ballot_w32(x);
- }
+ return lane_mask & __builtin_amdgcn_ballot_w64(x);
}
/// Waits for all the threads in the block to converge and issues a fence.
@@ -153,15 +146,8 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
}
/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle cycle counter.
-LIBC_INLINE uint64_t processor_clock() {
- if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
- return __builtin_amdgcn_s_memtime();
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_readcyclecounter))
- return __builtin_readcyclecounter();
- else
- return 0;
-}
+/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
/// the card and can only be queried via the driver.
diff --git a/libc/src/math/gpu/vendor/amdgpu/platform.h b/libc/src/math/gpu/vendor/amdgpu/platform.h
index 160a8508cd8b95..b22240419cd0c1 100644
--- a/libc/src/math/gpu/vendor/amdgpu/platform.h
+++ b/libc/src/math/gpu/vendor/amdgpu/platform.h
@@ -32,88 +32,8 @@ extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
// Disable finite math optimizations.
extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
-#if defined(__gfx700__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7000;
-#elif defined(__gfx701__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7001;
-#elif defined(__gfx702__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7002;
-#elif defined(__gfx703__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7003;
-#elif defined(__gfx704__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7004;
-#elif defined(__gfx705__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 7005;
-#elif defined(__gfx801__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8001;
-#elif defined(__gfx802__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8002;
-#elif defined(__gfx803__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8003;
-#elif defined(__gfx805__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8005;
-#elif defined(__gfx810__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 8100;
-#elif defined(__gfx900__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
-#elif defined(__gfx902__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9002;
-#elif defined(__gfx904__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9004;
-#elif defined(__gfx906__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9006;
-#elif defined(__gfx908__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9008;
-#elif defined(__gfx909__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9009;
-#elif defined(__gfx90a__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9010;
-#elif defined(__gfx90c__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9012;
-#elif defined(__gfx940__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9400;
-#elif defined(__gfx941__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9401;
-#elif defined(__gfx942__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9402;
-#elif defined(__gfx1010__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10100;
-#elif defined(__gfx1011__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10101;
-#elif defined(__gfx1012__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10102;
-#elif defined(__gfx1013__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10103;
-#elif defined(__gfx1030__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10300;
-#elif defined(__gfx1031__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10301;
-#elif defined(__gfx1032__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10302;
-#elif defined(__gfx1033__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10303;
-#elif defined(__gfx1034__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10304;
-#elif defined(__gfx1035__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10305;
-#elif defined(__gfx1036__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10306;
-#elif defined(__gfx1100__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11000;
-#elif defined(__gfx1101__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11001;
-#elif defined(__gfx1102__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11002;
-#elif defined(__gfx1103__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11003;
-#elif defined(__gfx1150__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11500;
-#elif defined(__gfx1151__)
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 11501;
-#else
-#error "Unknown AMDGPU architecture"
-#endif
-}
+// Set the ISA to zero to ingore all ISA specific optimizations.
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 0;
// These aliases cause clang to emit the control constants with ODR linkage.
// This allows us to link against the symbols without preventing them from being
>From 729afca29e8821e835d08461c4b29507f71b1735 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 5 Feb 2024 12:23:31 -0600
Subject: [PATCH 2/3] Address comments
---
libc/src/math/gpu/vendor/amdgpu/platform.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libc/src/math/gpu/vendor/amdgpu/platform.h b/libc/src/math/gpu/vendor/amdgpu/platform.h
index b22240419cd0c1..7622115d083334 100644
--- a/libc/src/math/gpu/vendor/amdgpu/platform.h
+++ b/libc/src/math/gpu/vendor/amdgpu/platform.h
@@ -32,8 +32,8 @@ extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
// Disable finite math optimizations.
extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
-// Set the ISA to zero to ingore all ISA specific optimizations.
-extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 0;
+// Set the ISA to gfx900 to assume FMA instructions are present.
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
// These aliases cause clang to emit the control constants with ODR linkage.
// This allows us to link against the symbols without preventing them from being
>From 0f111fee856bc6de38a74a4ec01c37a858e8037c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 6 Feb 2024 07:15:34 -0600
Subject: [PATCH 3/3] More comment
---
libc/src/math/gpu/vendor/amdgpu/platform.h | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/libc/src/math/gpu/vendor/amdgpu/platform.h b/libc/src/math/gpu/vendor/amdgpu/platform.h
index 7622115d083334..cee01e938e3039 100644
--- a/libc/src/math/gpu/vendor/amdgpu/platform.h
+++ b/libc/src/math/gpu/vendor/amdgpu/platform.h
@@ -32,7 +32,10 @@ extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
// Disable finite math optimizations.
extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
-// Set the ISA to gfx900 to assume FMA instructions are present.
+// Set the ISA value to a high enough value that the ROCm device library math
+// functions will assume we have fast FMA operations among other features. This
+// is determined to be safe on all targets by looking at the source code.
+// https://github.com/ROCm/ROCm-Device-Libs/blob/amd-stg-open/ocml/src/opts.h
extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
// These aliases cause clang to emit the control constants with ODR linkage.
More information about the libc-commits
mailing list