[flang-commits] [clang] [compiler-rt] [flang] [libc] [libclc] [libcxx] [libcxxabi] [lld] [lldb] [llvm] [mlir] [libc++] implement ranges::find_last (PR #91081)
Andrew Sukach via flang-commits
flang-commits at lists.llvm.org
Sat May 4 14:18:54 PDT 2024
https://github.com/soukatch updated https://github.com/llvm/llvm-project/pull/91081
From 4ef4041a17fd8a06bebb959d6afdab105141c2ed Mon Sep 17 00:00:00 2001
From: Andrew Sukach <andrewsukach at gmail.com>
Date: Sat, 4 May 2024 15:30:06 -0400
Subject: [PATCH 01/10] [libc++] implement ranges::find_last
---
libcxx/include/CMakeLists.txt | 1 +
libcxx/include/__algorithm/ranges_find_last.h | 83 +++++++++++++++++++
libcxx/include/algorithm | 1 +
libcxx/include/module.modulemap | 1 +
.../gn/secondary/libcxx/include/BUILD.gn | 1 +
5 files changed, 87 insertions(+)
create mode 100644 libcxx/include/__algorithm/ranges_find_last.h
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index fd7eb125e007b6..56cce0a42ca771 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -113,6 +113,7 @@ set(files
__algorithm/ranges_find_first_of.h
__algorithm/ranges_find_if.h
__algorithm/ranges_find_if_not.h
+ __algorithm/ranges_find_last.h
__algorithm/ranges_for_each.h
__algorithm/ranges_for_each_n.h
__algorithm/ranges_generate.h
diff --git a/libcxx/include/__algorithm/ranges_find_last.h b/libcxx/include/__algorithm/ranges_find_last.h
new file mode 100644
index 00000000000000..94413e673f0719
--- /dev/null
+++ b/libcxx/include/__algorithm/ranges_find_last.h
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
+#define _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
+
+#include <__algorithm/ranges_find.h>
+#include <__config>
+#include <__functional/identity.h>
+#include <__functional/invoke.h>
+#include <__functional/ranges_operations.h>
+#include <__iterator/concepts.h>
+#include <__iterator/projected.h>
+#include <__iterator/reverse_iterator.h>
+#include <__ranges/access.h>
+#include <__ranges/concepts.h>
+#include <__ranges/dangling.h>
+#include <__ranges/subrange.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if _LIBCPP_STD_VER >= 23
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace ranges {
+
+namespace __find_last {
+struct __fn {
+ template <forward_iterator _It, sentinel_for<_It> _Sent, typename _Tp, typename _Proj = identity>
+ requires indirect_binary_predicate<equal_to, projected<_It, _Proj>, const _Tp*>
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr subrange<_It>
+ operator()(_It __first, _Sent __last, const _Tp& __value, _Proj __proj = {}) const {
+ if constexpr (same_as<_It, _Sent> && bidirectional_iterator<_It>) {
+ const auto __found{find(reverse_iterator{__last}, reverse_iterator{__first}, __value, std::move(__proj)).base()};
+ if (__found == __first)
+ return {__last, __last};
+ return {prev(__found), __last};
+ } else {
+ auto __found{find(__first, __last, __value, __proj)};
+ if (__found == __last)
+ return {__last, __last};
+
+ for (__first = __found;; __found = __first++)
+ if ((__first = find(__first, __last, __value, __proj)) == __last)
+ return {__found, __last};
+ }
+ }
+
+ template <forward_range _Range, typename _Tp, typename _Proj = identity>
+ requires indirect_binary_predicate<equal_to, projected<iterator_t<_Range>, _Proj>, const _Tp*>
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr borrowed_subrange_t<_Range>
+ operator()(_Range&& __r, const _Tp& __value, _Proj __proj = {}) const {
+ return this->operator()(begin(__r), end(__r), __value, std::move(__proj));
+ }
+};
+
+} // namespace __find_last
+
+inline namespace __cpo {
+inline constexpr __find_last::__fn find_last{};
+} // namespace __cpo
+} // namespace ranges
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
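For reference, here is a minimal usage sketch of the interface this patch adds (not part of the patch itself): the bidirectional branch above searches from the back via reverse_iterator, while the forward branch repeatedly re-runs find; either way the caller gets a subrange from the last match to the end of the range, or an empty subrange at the end when there is no match. This assumes a C++23 toolchain whose standard library ships std::ranges::find_last.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3, 1, 2, 3};

  // Last occurrence: the returned subrange starts at the final match
  // and always ends at the end of the searched range.
  auto hit = std::ranges::find_last(v, 3);
  assert(hit.begin() == v.begin() + 5);
  assert(hit.end() == v.end());

  // No occurrence: both ends of the subrange are the end iterator.
  auto miss = std::ranges::find_last(v, 42);
  assert(miss.begin() == v.end() && miss.end() == v.end());
}
```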
diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm
index 869fc19737b572..508579c75610a4 100644
--- a/libcxx/include/algorithm
+++ b/libcxx/include/algorithm
@@ -2004,6 +2004,7 @@ template <class BidirectionalIterator, class Compare>
# include <__algorithm/fold.h>
# include <__algorithm/ranges_contains_subrange.h>
# include <__algorithm/ranges_ends_with.h>
+# include <__algorithm/ranges_find_last.h>
# include <__algorithm/ranges_starts_with.h>
#endif // _LIBCPP_STD_VER >= 23
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 2974d12500c4cb..f354bca3ca45b3 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -787,6 +787,7 @@ module std_private_algorithm_ranges_find_end [system
module std_private_algorithm_ranges_find_first_of [system] { header "__algorithm/ranges_find_first_of.h" }
module std_private_algorithm_ranges_find_if [system] { header "__algorithm/ranges_find_if.h" }
module std_private_algorithm_ranges_find_if_not [system] { header "__algorithm/ranges_find_if_not.h" }
+module std_private_algorithm_ranges_find_last [system] { header "__algorithm/ranges_find_last.h" }
module std_private_algorithm_ranges_for_each [system] {
header "__algorithm/ranges_for_each.h"
export std_private_algorithm_in_fun_result
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 9645bff18ae72b..789d03e66d3c37 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -183,6 +183,7 @@ if (current_toolchain == default_toolchain) {
"__algorithm/ranges_find_first_of.h",
"__algorithm/ranges_find_if.h",
"__algorithm/ranges_find_if_not.h",
+ "__algorithm/ranges_find_last.h",
"__algorithm/ranges_for_each.h",
"__algorithm/ranges_for_each_n.h",
"__algorithm/ranges_generate.h",
From cc1198f7b81133a5d4ab9f0b189cda0e6a623314 Mon Sep 17 00:00:00 2001
From: luolent <56246516+luolent at users.noreply.github.com>
Date: Sat, 4 May 2024 20:38:45 +0300
Subject: [PATCH 02/10] Add clarifying parentheses around non-trivial
conditions in ternary expressions. (#90391)
Fixes [#85868](https://github.com/llvm/llvm-project/issues/85868)
Parentheses are added, as requested, to ternary operators with non-trivial conditions.
I used this [precedence table](https://en.cppreference.com/w/cpp/language/operator_precedence) for reference, to make sure we get the expected behavior on each change.
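Note that the added parentheses do not change behavior: `&` and `%` bind tighter than `?:`, so the grouping was already what the parentheses now spell out, and the change is purely for readability. A small illustrative snippet (the names are made up for the example):

```cpp
#include <cstdio>

int main() {
  unsigned status = 0x1;              // some status word
  const unsigned INVALID_BIT = 0x1;   // flag being tested
  const unsigned FE_INVALID_MACRO = 0x10;

  // operator& has higher precedence than ?:, so both spellings group the
  // same way; the parenthesized form just makes that explicit.
  unsigned a = status & INVALID_BIT ? FE_INVALID_MACRO : 0;
  unsigned b = (status & INVALID_BIT) ? FE_INVALID_MACRO : 0;
  std::printf("%u %u\n", a, b);       // prints "16 16"
}
```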
---
clang/lib/Basic/Targets/AMDGPU.cpp | 2 +-
compiler-rt/lib/xray/xray_utils.h | 2 +-
libc/src/__support/FPUtil/aarch64/FEnvImpl.h | 20 ++++----
.../FPUtil/aarch64/fenv_darwin_impl.h | 48 +++++++++----------
libc/src/__support/FPUtil/arm/FEnvImpl.h | 40 ++++++++--------
libc/src/__support/FPUtil/riscv/FEnvImpl.h | 20 ++++----
libc/src/__support/FPUtil/x86_64/FEnvImpl.h | 24 +++++-----
libclc/generic/lib/math/log_base.h | 2 +-
libcxxabi/src/cxa_personality.cpp | 4 +-
lld/ELF/LinkerScript.cpp | 2 +-
.../source/MacOSX/MachException.cpp | 2 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 4 +-
.../Disassembler/AMDGPUDisassembler.cpp | 4 +-
llvm/lib/Target/AVR/AVRAsmPrinter.cpp | 4 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 4 +-
.../lib/Target/X86/AsmParser/X86AsmParser.cpp | 2 +-
.../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 2 +-
.../Transforms/TosaDecomposeTransposeConv.cpp | 4 +-
18 files changed, 94 insertions(+), 96 deletions(-)
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 5742885df0461b..cc7be64656e5b2 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -232,7 +232,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
HasLegalHalfType = true;
HasFloat16 = true;
- WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64;
+ WavefrontSize = (GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32) ? 32 : 64;
AllowAMDGPUUnsafeFPAtomics = Opts.AllowAMDGPUUnsafeFPAtomics;
// Set pointer width and alignment for the generic address space.
diff --git a/compiler-rt/lib/xray/xray_utils.h b/compiler-rt/lib/xray/xray_utils.h
index 333826168c0db2..5dc73d7fa8cdea 100644
--- a/compiler-rt/lib/xray/xray_utils.h
+++ b/compiler-rt/lib/xray/xray_utils.h
@@ -61,7 +61,7 @@ constexpr size_t gcd(size_t a, size_t b) {
constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
constexpr size_t nearest_boundary(size_t number, size_t multiple) {
- return multiple * ((number / multiple) + (number % multiple ? 1 : 0));
+ return multiple * ((number / multiple) + ((number % multiple) ? 1 : 0));
}
constexpr size_t next_pow2_helper(size_t num, size_t acc) {
diff --git a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
index d1d92169475d15..cd8a5970edd65a 100644
--- a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
@@ -53,19 +53,19 @@ struct FEnv {
static constexpr uint32_t ExceptionControlFlagsBitPosition = 8;
LIBC_INLINE static uint32_t getStatusValueForExcept(int excepts) {
- return (excepts & FE_INVALID ? INVALID : 0) |
- (excepts & FE_DIVBYZERO ? DIVBYZERO : 0) |
- (excepts & FE_OVERFLOW ? OVERFLOW : 0) |
- (excepts & FE_UNDERFLOW ? UNDERFLOW : 0) |
- (excepts & FE_INEXACT ? INEXACT : 0);
+ return ((excepts & FE_INVALID) ? INVALID : 0) |
+ ((excepts & FE_DIVBYZERO) ? DIVBYZERO : 0) |
+ ((excepts & FE_OVERFLOW) ? OVERFLOW : 0) |
+ ((excepts & FE_UNDERFLOW) ? UNDERFLOW : 0) |
+ ((excepts & FE_INEXACT) ? INEXACT : 0);
}
LIBC_INLINE static int exceptionStatusToMacro(uint32_t status) {
- return (status & INVALID ? FE_INVALID : 0) |
- (status & DIVBYZERO ? FE_DIVBYZERO : 0) |
- (status & OVERFLOW ? FE_OVERFLOW : 0) |
- (status & UNDERFLOW ? FE_UNDERFLOW : 0) |
- (status & INEXACT ? FE_INEXACT : 0);
+ return ((status & INVALID) ? FE_INVALID : 0) |
+ ((status & DIVBYZERO) ? FE_DIVBYZERO : 0) |
+ ((status & OVERFLOW) ? FE_OVERFLOW : 0) |
+ ((status & UNDERFLOW) ? FE_UNDERFLOW : 0) |
+ ((status & INEXACT) ? FE_INEXACT : 0);
}
static uint32_t getControlWord() {
diff --git a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
index 5b59ba38d67bb6..feb48e3719bf16 100644
--- a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
+++ b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
@@ -63,39 +63,39 @@ struct FEnv {
// located in a different place from FE_FLUSHTOZERO status bit relative to
// the other exceptions.
LIBC_INLINE static uint32_t exception_value_from_status(int status) {
- return (status & FE_INVALID ? EX_INVALID : 0) |
- (status & FE_DIVBYZERO ? EX_DIVBYZERO : 0) |
- (status & FE_OVERFLOW ? EX_OVERFLOW : 0) |
- (status & FE_UNDERFLOW ? EX_UNDERFLOW : 0) |
- (status & FE_INEXACT ? EX_INEXACT : 0) |
- (status & FE_FLUSHTOZERO ? EX_FLUSHTOZERO : 0);
+ return ((status & FE_INVALID) ? EX_INVALID : 0) |
+ ((status & FE_DIVBYZERO) ? EX_DIVBYZERO : 0) |
+ ((status & FE_OVERFLOW) ? EX_OVERFLOW : 0) |
+ ((status & FE_UNDERFLOW) ? EX_UNDERFLOW : 0) |
+ ((status & FE_INEXACT) ? EX_INEXACT : 0) |
+ ((status & FE_FLUSHTOZERO) ? EX_FLUSHTOZERO : 0);
}
LIBC_INLINE static uint32_t exception_value_from_control(int control) {
- return (control & __fpcr_trap_invalid ? EX_INVALID : 0) |
- (control & __fpcr_trap_divbyzero ? EX_DIVBYZERO : 0) |
- (control & __fpcr_trap_overflow ? EX_OVERFLOW : 0) |
- (control & __fpcr_trap_underflow ? EX_UNDERFLOW : 0) |
- (control & __fpcr_trap_inexact ? EX_INEXACT : 0) |
- (control & __fpcr_flush_to_zero ? EX_FLUSHTOZERO : 0);
+ return ((control & __fpcr_trap_invalid) ? EX_INVALID : 0) |
+ ((control & __fpcr_trap_divbyzero) ? EX_DIVBYZERO : 0) |
+ ((control & __fpcr_trap_overflow) ? EX_OVERFLOW : 0) |
+ ((control & __fpcr_trap_underflow) ? EX_UNDERFLOW : 0) |
+ ((control & __fpcr_trap_inexact) ? EX_INEXACT : 0) |
+ ((control & __fpcr_flush_to_zero) ? EX_FLUSHTOZERO : 0);
}
LIBC_INLINE static int exception_value_to_status(uint32_t excepts) {
- return (excepts & EX_INVALID ? FE_INVALID : 0) |
- (excepts & EX_DIVBYZERO ? FE_DIVBYZERO : 0) |
- (excepts & EX_OVERFLOW ? FE_OVERFLOW : 0) |
- (excepts & EX_UNDERFLOW ? FE_UNDERFLOW : 0) |
- (excepts & EX_INEXACT ? FE_INEXACT : 0) |
- (excepts & EX_FLUSHTOZERO ? FE_FLUSHTOZERO : 0);
+ return ((excepts & EX_INVALID) ? FE_INVALID : 0) |
+ ((excepts & EX_DIVBYZERO) ? FE_DIVBYZERO : 0) |
+ ((excepts & EX_OVERFLOW) ? FE_OVERFLOW : 0) |
+ ((excepts & EX_UNDERFLOW) ? FE_UNDERFLOW : 0) |
+ ((excepts & EX_INEXACT) ? FE_INEXACT : 0) |
+ ((excepts & EX_FLUSHTOZERO) ? FE_FLUSHTOZERO : 0);
}
LIBC_INLINE static int exception_value_to_control(uint32_t excepts) {
- return (excepts & EX_INVALID ? __fpcr_trap_invalid : 0) |
- (excepts & EX_DIVBYZERO ? __fpcr_trap_divbyzero : 0) |
- (excepts & EX_OVERFLOW ? __fpcr_trap_overflow : 0) |
- (excepts & EX_UNDERFLOW ? __fpcr_trap_underflow : 0) |
- (excepts & EX_INEXACT ? __fpcr_trap_inexact : 0) |
- (excepts & EX_FLUSHTOZERO ? __fpcr_flush_to_zero : 0);
+ return ((excepts & EX_INVALID) ? __fpcr_trap_invalid : 0) |
+ ((excepts & EX_DIVBYZERO) ? __fpcr_trap_divbyzero : 0) |
+ ((excepts & EX_OVERFLOW) ? __fpcr_trap_overflow : 0) |
+ ((excepts & EX_UNDERFLOW) ? __fpcr_trap_underflow : 0) |
+ ((excepts & EX_INEXACT) ? __fpcr_trap_inexact : 0) |
+ ((excepts & EX_FLUSHTOZERO) ? __fpcr_flush_to_zero : 0);
}
LIBC_INLINE static uint32_t get_control_word() { return __arm_rsr("fpcr"); }
diff --git a/libc/src/__support/FPUtil/arm/FEnvImpl.h b/libc/src/__support/FPUtil/arm/FEnvImpl.h
index 78fbda4f7afff1..cb8d31d683af39 100644
--- a/libc/src/__support/FPUtil/arm/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/arm/FEnvImpl.h
@@ -50,35 +50,35 @@ struct FEnv {
}
LIBC_INLINE static int exception_enable_bits_to_macro(uint32_t status) {
- return (status & INVALID_ENABLE ? FE_INVALID : 0) |
- (status & DIVBYZERO_ENABLE ? FE_DIVBYZERO : 0) |
- (status & OVERFLOW_ENABLE ? FE_OVERFLOW : 0) |
- (status & UNDERFLOW_ENABLE ? FE_UNDERFLOW : 0) |
- (status & INEXACT_ENABLE ? FE_INEXACT : 0);
+ return ((status & INVALID_ENABLE) ? FE_INVALID : 0) |
+ ((status & DIVBYZERO_ENABLE) ? FE_DIVBYZERO : 0) |
+ ((status & OVERFLOW_ENABLE) ? FE_OVERFLOW : 0) |
+ ((status & UNDERFLOW_ENABLE) ? FE_UNDERFLOW : 0) |
+ ((status & INEXACT_ENABLE) ? FE_INEXACT : 0);
}
LIBC_INLINE static uint32_t exception_macro_to_enable_bits(int except) {
- return (except & FE_INVALID ? INVALID_ENABLE : 0) |
- (except & FE_DIVBYZERO ? DIVBYZERO_ENABLE : 0) |
- (except & FE_OVERFLOW ? OVERFLOW_ENABLE : 0) |
- (except & FE_UNDERFLOW ? UNDERFLOW_ENABLE : 0) |
- (except & FE_INEXACT ? INEXACT_ENABLE : 0);
+ return ((except & FE_INVALID) ? INVALID_ENABLE : 0) |
+ ((except & FE_DIVBYZERO) ? DIVBYZERO_ENABLE : 0) |
+ ((except & FE_OVERFLOW) ? OVERFLOW_ENABLE : 0) |
+ ((except & FE_UNDERFLOW) ? UNDERFLOW_ENABLE : 0) |
+ ((except & FE_INEXACT) ? INEXACT_ENABLE : 0);
}
LIBC_INLINE static uint32_t exception_macro_to_status_bits(int except) {
- return (except & FE_INVALID ? INVALID_STATUS : 0) |
- (except & FE_DIVBYZERO ? DIVBYZERO_STATUS : 0) |
- (except & FE_OVERFLOW ? OVERFLOW_STATUS : 0) |
- (except & FE_UNDERFLOW ? UNDERFLOW_STATUS : 0) |
- (except & FE_INEXACT ? INEXACT_STATUS : 0);
+ return ((except & FE_INVALID) ? INVALID_STATUS : 0) |
+ ((except & FE_DIVBYZERO) ? DIVBYZERO_STATUS : 0) |
+ ((except & FE_OVERFLOW) ? OVERFLOW_STATUS : 0) |
+ ((except & FE_UNDERFLOW) ? UNDERFLOW_STATUS : 0) |
+ ((except & FE_INEXACT) ? INEXACT_STATUS : 0);
}
LIBC_INLINE static uint32_t exception_status_bits_to_macro(int status) {
- return (status & INVALID_STATUS ? FE_INVALID : 0) |
- (status & DIVBYZERO_STATUS ? FE_DIVBYZERO : 0) |
- (status & OVERFLOW_STATUS ? FE_OVERFLOW : 0) |
- (status & UNDERFLOW_STATUS ? FE_UNDERFLOW : 0) |
- (status & INEXACT_STATUS ? FE_INEXACT : 0);
+ return ((status & INVALID_STATUS) ? FE_INVALID : 0) |
+ ((status & DIVBYZERO_STATUS) ? FE_DIVBYZERO : 0) |
+ ((status & OVERFLOW_STATUS) ? FE_OVERFLOW : 0) |
+ ((status & UNDERFLOW_STATUS) ? FE_UNDERFLOW : 0) |
+ ((status & INEXACT_STATUS) ? FE_INEXACT : 0);
}
};
diff --git a/libc/src/__support/FPUtil/riscv/FEnvImpl.h b/libc/src/__support/FPUtil/riscv/FEnvImpl.h
index e7aee3ba4b9109..1de464a89de482 100644
--- a/libc/src/__support/FPUtil/riscv/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/riscv/FEnvImpl.h
@@ -65,19 +65,19 @@ struct FEnv {
}
LIBC_INLINE static int exception_bits_to_macro(uint32_t status) {
- return (status & INVALID ? FE_INVALID : 0) |
- (status & DIVBYZERO ? FE_DIVBYZERO : 0) |
- (status & OVERFLOW ? FE_OVERFLOW : 0) |
- (status & UNDERFLOW ? FE_UNDERFLOW : 0) |
- (status & INEXACT ? FE_INEXACT : 0);
+ return ((status & INVALID) ? FE_INVALID : 0) |
+ ((status & DIVBYZERO) ? FE_DIVBYZERO : 0) |
+ ((status & OVERFLOW) ? FE_OVERFLOW : 0) |
+ ((status & UNDERFLOW) ? FE_UNDERFLOW : 0) |
+ ((status & INEXACT) ? FE_INEXACT : 0);
}
LIBC_INLINE static uint32_t exception_macro_to_bits(int except) {
- return (except & FE_INVALID ? INVALID : 0) |
- (except & FE_DIVBYZERO ? DIVBYZERO : 0) |
- (except & FE_OVERFLOW ? OVERFLOW : 0) |
- (except & FE_UNDERFLOW ? UNDERFLOW : 0) |
- (except & FE_INEXACT ? INEXACT : 0);
+ return ((except & FE_INVALID) ? INVALID : 0) |
+ ((except & FE_DIVBYZERO) ? DIVBYZERO : 0) |
+ ((except & FE_OVERFLOW) ? OVERFLOW : 0) |
+ ((except & FE_UNDERFLOW) ? UNDERFLOW : 0) |
+ ((except & FE_INEXACT) ? INEXACT : 0);
}
};
diff --git a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
index 0595658d7df328..a157b81aaaf325 100644
--- a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
@@ -72,25 +72,25 @@ static constexpr uint16_t MXCSR_EXCEPTION_CONTOL_BIT_POISTION = 7;
LIBC_INLINE uint16_t get_status_value_for_except(int excepts) {
// We will make use of the fact that exception control bits are single
// bit flags in the control registers.
- return (excepts & FE_INVALID ? ExceptionFlags::INVALID_F : 0) |
+ return ((excepts & FE_INVALID) ? ExceptionFlags::INVALID_F : 0) |
#ifdef __FE_DENORM
- (excepts & __FE_DENORM ? ExceptionFlags::DENORMAL_F : 0) |
+ ((excepts & __FE_DENORM) ? ExceptionFlags::DENORMAL_F : 0) |
#endif // __FE_DENORM
- (excepts & FE_DIVBYZERO ? ExceptionFlags::DIV_BY_ZERO_F : 0) |
- (excepts & FE_OVERFLOW ? ExceptionFlags::OVERFLOW_F : 0) |
- (excepts & FE_UNDERFLOW ? ExceptionFlags::UNDERFLOW_F : 0) |
- (excepts & FE_INEXACT ? ExceptionFlags::INEXACT_F : 0);
+ ((excepts & FE_DIVBYZERO) ? ExceptionFlags::DIV_BY_ZERO_F : 0) |
+ ((excepts & FE_OVERFLOW) ? ExceptionFlags::OVERFLOW_F : 0) |
+ ((excepts & FE_UNDERFLOW) ? ExceptionFlags::UNDERFLOW_F : 0) |
+ ((excepts & FE_INEXACT) ? ExceptionFlags::INEXACT_F : 0);
}
LIBC_INLINE int exception_status_to_macro(uint16_t status) {
- return (status & ExceptionFlags::INVALID_F ? FE_INVALID : 0) |
+ return ((status & ExceptionFlags::INVALID_F) ? FE_INVALID : 0) |
#ifdef __FE_DENORM
- (status & ExceptionFlags::DENORMAL_F ? __FE_DENORM : 0) |
+ ((status & ExceptionFlags::DENORMAL_F) ? __FE_DENORM : 0) |
#endif // __FE_DENORM
- (status & ExceptionFlags::DIV_BY_ZERO_F ? FE_DIVBYZERO : 0) |
- (status & ExceptionFlags::OVERFLOW_F ? FE_OVERFLOW : 0) |
- (status & ExceptionFlags::UNDERFLOW_F ? FE_UNDERFLOW : 0) |
- (status & ExceptionFlags::INEXACT_F ? FE_INEXACT : 0);
+ ((status & ExceptionFlags::DIV_BY_ZERO_F) ? FE_DIVBYZERO : 0) |
+ ((status & ExceptionFlags::OVERFLOW_F) ? FE_OVERFLOW : 0) |
+ ((status & ExceptionFlags::UNDERFLOW_F) ? FE_UNDERFLOW : 0) |
+ ((status & ExceptionFlags::INEXACT_F) ? FE_INEXACT : 0);
}
struct X87StateDescriptor {
diff --git a/libclc/generic/lib/math/log_base.h b/libclc/generic/lib/math/log_base.h
index f5b6f1cb44991a..2558f016f60bef 100644
--- a/libclc/generic/lib/math/log_base.h
+++ b/libclc/generic/lib/math/log_base.h
@@ -289,7 +289,7 @@ log(double x)
double ret = is_near ? ret_near : ret_far;
ret = isinf(x) ? as_double(PINFBITPATT_DP64) : ret;
- ret = isnan(x) | (x < 0.0) ? as_double(QNANBITPATT_DP64) : ret;
+ ret = (isnan(x) | (x < 0.0)) ? as_double(QNANBITPATT_DP64) : ret;
ret = x == 0.0 ? as_double(NINFBITPATT_DP64) : ret;
return ret;
}
diff --git a/libcxxabi/src/cxa_personality.cpp b/libcxxabi/src/cxa_personality.cpp
index 4b6c4edbc26698..d95d781319401b 100644
--- a/libcxxabi/src/cxa_personality.cpp
+++ b/libcxxabi/src/cxa_personality.cpp
@@ -717,9 +717,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
if (actionEntry == 0)
{
// Found a cleanup
- results.reason = actions & _UA_SEARCH_PHASE
- ? _URC_CONTINUE_UNWIND
- : _URC_HANDLER_FOUND;
+ results.reason = (actions & _UA_SEARCH_PHASE) ? _URC_CONTINUE_UNWIND : _URC_HANDLER_FOUND;
return;
}
// Convert 1-based byte offset into
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index f815b3ac6feeda..f9d8dcc4f71d95 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -801,7 +801,7 @@ static OutputDesc *addInputSec(StringMap<TinyPtrVector<OutputSection *>> &map,
auto *firstIsec = cast<InputSectionBase>(
cast<InputSectionDescription>(sec->commands[0])->sectionBases[0]);
OutputSection *firstIsecOut =
- firstIsec->flags & SHF_LINK_ORDER
+ (firstIsec->flags & SHF_LINK_ORDER)
? firstIsec->getLinkOrderDep()->getOutputSection()
: nullptr;
if (firstIsecOut != isec->getLinkOrderDep()->getOutputSection())
diff --git a/lldb/tools/debugserver/source/MacOSX/MachException.cpp b/lldb/tools/debugserver/source/MacOSX/MachException.cpp
index eab4cdfc8b775d..659fb2ff8186df 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachException.cpp
+++ b/lldb/tools/debugserver/source/MacOSX/MachException.cpp
@@ -247,7 +247,7 @@ kern_return_t MachException::Message::Receive(mach_port_t port,
DNBError err;
const bool log_exceptions = DNBLogCheckLogBit(LOG_EXCEPTIONS);
mach_msg_timeout_t mach_msg_timeout =
- options & MACH_RCV_TIMEOUT ? timeout : 0;
+ (options & MACH_RCV_TIMEOUT) ? timeout : 0;
if (log_exceptions && ((options & MACH_RCV_TIMEOUT) == 0)) {
// Dump this log message if we have no timeout in case it never returns
DNBLogThreaded("::mach_msg ( msg->{bits = %#x, size = %u remote_port = "
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 442a9c12b42f24..e7930b68972e73 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -3910,8 +3910,8 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst,
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
- int RSrcOpName = Desc.TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc
- : AMDGPU::OpName::rsrc;
+ int RSrcOpName = (Desc.TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc
+ : AMDGPU::OpName::rsrc;
int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RSrcOpName);
int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16);
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 8fd36b84a00cd8..05063c6c321a6a 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -921,8 +921,8 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AMDGPU::OpName::vdata);
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
- int RsrcOpName = TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc
- : AMDGPU::OpName::rsrc;
+ int RsrcOpName = (TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc
+ : AMDGPU::OpName::rsrc;
int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 1c8213b668f71a..aaec545fc1feaa 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -134,8 +134,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
Reg = MI->getOperand(OpNum + RegIdx).getReg();
if (BytesPerReg == 2) {
- Reg = TRI.getSubReg(Reg,
- ByteNumber % BytesPerReg ? AVR::sub_hi : AVR::sub_lo);
+ Reg = TRI.getSubReg(Reg, (ByteNumber % BytesPerReg) ? AVR::sub_hi
+ : AVR::sub_lo);
}
O << AVRInstPrinter::getPrettyRegisterName(Reg, MRI);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 3238b62218dbaa..0d1aab89c5de89 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -2146,8 +2146,8 @@ prepareCompareSwapOperands(MachineBasicBlock::iterator const MBBI) const {
unsigned SystemZ::reverseCCMask(unsigned CCMask) {
return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
- (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
- (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+ ((CCMask & SystemZ::CCMASK_CMP_GT) ? SystemZ::CCMASK_CMP_LT : 0) |
+ ((CCMask & SystemZ::CCMASK_CMP_LT) ? SystemZ::CCMASK_CMP_GT : 0) |
(CCMask & SystemZ::CCMASK_CMP_UO));
}
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b05a036fb2f06b..2c2dc21f191d7a 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3802,7 +3802,7 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
// VFMULCPHZrr Dest, Src1, Src2
// VFMULCPHZrrk Dest, Dest, Mask, Src1, Src2
// VFMULCPHZrrkz Dest, Mask, Src1, Src2
- for (unsigned i = TSFlags & X86II::EVEX_K ? 2 : 1;
+ for (unsigned i = ((TSFlags & X86II::EVEX_K) ? 2 : 1);
i < Inst.getNumOperands(); i++)
if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
return Warning(Ops[0]->getStartLoc(), "Destination register should be "
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a5859f98bae026..b4633b91bee322 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -980,7 +980,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
break;
case X86II::VEX:
// VEX can be 2 byte or 3 byte, not determined yet if not explicit
- Prefix.setLowerBound(MI.getFlags() & X86::IP_USE_VEX3 ? VEX3 : VEX2);
+ Prefix.setLowerBound((MI.getFlags() & X86::IP_USE_VEX3) ? VEX3 : VEX2);
break;
case X86II::EVEX:
Prefix.setLowerBound(EVEX);
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
index 8d937217d70655..a94bb3a920b1db 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
@@ -182,9 +182,9 @@ class TransposeConvStridedConverter
// Pad the weight so that it is modulo of the striding.
llvm::SmallVector<int32_t, 8> weightPadding = {0, 0, 0, 0, 0, 0, 0, 0};
weightPadding[3] =
- weightHeight % stride[0] ? stride[0] - weightHeight % stride[0] : 0;
+ (weightHeight % stride[0]) ? (stride[0] - weightHeight % stride[0]) : 0;
weightPadding[5] =
- weightWidth % stride[1] ? stride[1] - weightWidth % stride[1] : 0;
+ (weightWidth % stride[1]) ? (stride[1] - weightWidth % stride[1]) : 0;
DenseElementsAttr weightPaddingAttr = DenseIntElementsAttr::get(
RankedTensorType::get({4, 2}, rewriter.getI32Type()), weightPadding);
Value weightPaddingVal = createOpAndInfer<tosa::ConstOp>(
From dc5bc5d02d2a3e64df299b8aea68c134fa385b66 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Sat, 4 May 2024 19:57:07 +0200
Subject: [PATCH 03/10] Revert 4 last AMDGPU commits to unbreak Windows bots
Revert "AMDGPU: Try to fix build error with old gcc"
This reverts commit c7ad12d0d7606b0b9fb531b0b273bdc5f1490ddb.
Revert "AMDGPU: Use umin in set.rounding expansion"
This reverts commit a56f0b51dd988ad2b533de759c98457c1ed42456.
Revert "AMDGPU: Optimize set_rounding if input is known to fit in 2 bits (#88588)"
This reverts commit b4e751e2ab0ff152ed18dea59ebf9691e963e1dd.
Revert "AMDGPU: Implement llvm.set.rounding (#88587)"
This reverts commit 9731b77e80261c627d79980f8c275700bdaf6591.
---
llvm/docs/AMDGPUUsage.rst | 6 -
llvm/docs/LangRef.rst | 2 -
llvm/docs/ReleaseNotes.rst | 2 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 88 -
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 -
.../Target/AMDGPU/SIModeRegisterDefaults.cpp | 119 --
.../Target/AMDGPU/SIModeRegisterDefaults.h | 7 -
llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 1665 -----------------
8 files changed, 1890 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 51969be85648f1..029db00134c09d 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1157,12 +1157,6 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
register do not exactly match the FLT_ROUNDS values,
so a conversion is performed.
- :ref:`llvm.set.rounding<int_set_rounding>` Input value expected to be one of the valid results
- from '``llvm.get.rounding``'. Rounding mode is
- undefined if not passed a valid input. This should be
- a wave uniform value. In case of a divergent input
- value, the first active lane's value will be used.
-
:ref:`llvm.get.fpenv<int_get_fpenv>` Returns the current value of the AMDGPU floating point environment.
This stores information related to the current rounding mode,
denormalization mode, enabled traps, and floating point exceptions.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 2077fdd841fcd6..6291a4e57919a5 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -26739,8 +26739,6 @@ specified by C standard:
Other values may be used to represent additional rounding modes, supported by a
target. These values are target-specific.
-.. _int_set_rounding:
-
'``llvm.set.rounding``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 0f4e2759de08ac..59c0d4dd2376dd 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -81,8 +81,6 @@ Changes to the AMDGPU Backend
* Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.
-* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`
-
Changes to the ARM Backend
--------------------------
* FEAT_F32MM is no longer activated by default when using `+sve` on v8.6-A or greater. The feature is still available and can be used by adding `+f32mm` to the command line options.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ed41c10b50d323..cb4efdc7cf657c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -877,7 +877,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
- setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
@@ -4060,91 +4059,6 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
}
-SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc SL(Op);
-
- SDValue NewMode = Op.getOperand(1);
- assert(NewMode.getValueType() == MVT::i32);
-
- // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
- // hardware MODE.fp_round values.
- if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
- uint32_t ClampedVal = std::min(
- static_cast<uint32_t>(ConstMode->getZExtValue()),
- static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
- NewMode = DAG.getConstant(
- AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
- } else {
- // If we know the input can only be one of the supported standard modes in
- // the range 0-3, we can use a simplified mapping to hardware values.
- KnownBits KB = DAG.computeKnownBits(NewMode);
- const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
- // The supported standard values are 0-3. The extended values start at 8. We
- // need to offset by 4 if the value is in the extended range.
-
- if (UseReducedTable) {
- // Truncate to the low 32-bits.
- SDValue BitTable = DAG.getConstant(
- AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
-
- SDValue Two = DAG.getConstant(2, SL, MVT::i32);
- SDValue RoundModeTimesNumBits =
- DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
-
- NewMode =
- DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
-
- // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
- // the table extracted bits into inline immediates.
- } else {
- // table_index = umin(value, value - 4)
- // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
- SDValue BitTable =
- DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
-
- SDValue Four = DAG.getConstant(4, SL, MVT::i32);
- SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
- SDValue IndexVal =
- DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
-
- SDValue Two = DAG.getConstant(2, SL, MVT::i32);
- SDValue RoundModeTimesNumBits =
- DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
-
- SDValue TableValue =
- DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
- SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
-
- // No need to mask out the high bits since the setreg will ignore them
- // anyway.
- NewMode = TruncTable;
- }
-
- // Insert a readfirstlane in case the value is a VGPR. We could do this
- // earlier and keep more operations scalar, but that interferes with
- // combining the source.
- SDValue ReadFirstLaneID =
- DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
- NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
- ReadFirstLaneID, NewMode);
- }
-
- // N.B. The setreg will be later folded into s_round_mode on supported
- // targets.
- SDValue IntrinID =
- DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
- uint32_t BothRoundHwReg =
- AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
- SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
-
- SDValue SetReg =
- DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
- IntrinID, RoundBothImm, NewMode);
-
- return SetReg;
-}
-
SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
if (Op->isDivergent())
return SDValue();
@@ -5840,8 +5754,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTACKSAVE(Op, DAG);
case ISD::GET_ROUNDING:
return lowerGET_ROUNDING(Op, DAG);
- case ISD::SET_ROUNDING:
- return lowerSET_ROUNDING(Op, DAG);
case ISD::PREFETCH:
return lowerPREFETCH(Op, DAG);
case ISD::FP_EXTEND:
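For context on what the reverted lowering did: the deleted comments describe indexing a 64-bit table of 4-bit entries with `table_index = umin(value, value - 4)` and extracting `(bit_table >> (table_index << 2)) & 0xf`. Below is a standalone sketch of that lookup, not code from the tree; the helper and constant names are made up here, and the table value is the one the deleted tests materialize (0xb73e62d9'1c84a50f).

```cpp
#include <algorithm>
#include <cstdint>

// 16 four-bit entries mapping FLT_ROUNDS-style values (0-3 standard,
// 8 and up extended) to the hardware MODE.fp_round field.
constexpr uint64_t FltRoundToHWTable = 0xb73e62d91c84a50full;

constexpr uint32_t fltRoundsToHWMode(uint32_t FltRounds) {
  // umin(value, value - 4) maps 0-3 to themselves and folds the extended
  // range (8+) down by 4, mirroring the comment in the deleted lowering.
  uint32_t Index = std::min(FltRounds, FltRounds - 4u);
  return static_cast<uint32_t>(FltRoundToHWTable >> (Index * 4)) & 0xf;
}

// These agree with the immediates in the deleted s_set_rounding_{0..3} tests.
static_assert(fltRoundsToHWMode(0) == 0xf);  // toward zero
static_assert(fltRoundsToHWMode(1) == 0x0);  // nearest, ties to even
static_assert(fltRoundsToHWMode(2) == 0x5);  // toward +inf
static_assert(fltRoundsToHWMode(3) == 0xa);  // toward -inf
```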
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 08aa2a5991631d..9856a2923d38f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -422,7 +422,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
index 72bffc8400fa65..2684a1e3c3358a 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
@@ -174,122 +174,3 @@ static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
HWTowardNegative, HWTowardPositive)) ==
TowardNegativeF32_TowardPositiveF64);
-
-// Decode FLT_ROUNDS into the hardware value where the two rounding modes are
-// the same and use a standard value
-static constexpr uint64_t encodeFltRoundsToHWTableSame(uint32_t HWVal,
- uint32_t FltRoundsVal) {
- if (FltRoundsVal > TowardNegative)
- FltRoundsVal -= ExtendedFltRoundOffset;
-
- return static_cast<uint64_t>(getModeRegisterRoundMode(HWVal, HWVal))
- << (FltRoundsVal << 2);
-}
-
-/// Decode FLT_ROUNDS into the hardware value where the two rounding modes
-/// different and use an extended value.
-static constexpr uint64_t encodeFltRoundsToHWTable(uint32_t HWF32Val,
- uint32_t HWF64Val,
- uint32_t FltRoundsVal) {
- if (FltRoundsVal > TowardNegative)
- FltRoundsVal -= ExtendedFltRoundOffset;
- return static_cast<uint64_t>(getModeRegisterRoundMode(HWF32Val, HWF64Val))
- << (FltRoundsVal << 2);
-}
-
-constexpr uint64_t AMDGPU::FltRoundToHWConversionTable =
- encodeFltRoundsToHWTableSame(HWTowardZero, TowardZeroF32_TowardZeroF64) |
- encodeFltRoundsToHWTableSame(HWNearestTiesToEven,
- NearestTiesToEvenF32_NearestTiesToEvenF64) |
- encodeFltRoundsToHWTableSame(HWTowardPositive,
- TowardPositiveF32_TowardPositiveF64) |
- encodeFltRoundsToHWTableSame(HWTowardNegative,
- TowardNegativeF32_TowardNegativeF64) |
-
- encodeFltRoundsToHWTable(HWTowardZero, HWNearestTiesToEven,
- TowardZeroF32_NearestTiesToEvenF64) |
- encodeFltRoundsToHWTable(HWTowardZero, HWTowardPositive,
- TowardZeroF32_TowardPositiveF64) |
- encodeFltRoundsToHWTable(HWTowardZero, HWTowardNegative,
- TowardZeroF32_TowardNegativeF64) |
-
- encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardZero,
- NearestTiesToEvenF32_TowardZeroF64) |
- encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardPositive,
- NearestTiesToEvenF32_TowardPositiveF64) |
- encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardNegative,
- NearestTiesToEvenF32_TowardNegativeF64) |
-
- encodeFltRoundsToHWTable(HWTowardPositive, HWTowardZero,
- TowardPositiveF32_TowardZeroF64) |
- encodeFltRoundsToHWTable(HWTowardPositive, HWNearestTiesToEven,
- TowardPositiveF32_NearestTiesToEvenF64) |
- encodeFltRoundsToHWTable(HWTowardPositive, HWTowardNegative,
- TowardPositiveF32_TowardNegativeF64) |
-
- encodeFltRoundsToHWTable(HWTowardNegative, HWTowardZero,
- TowardNegativeF32_TowardZeroF64) |
- encodeFltRoundsToHWTable(HWTowardNegative, HWNearestTiesToEven,
- TowardNegativeF32_NearestTiesToEvenF64) |
- encodeFltRoundsToHWTable(HWTowardNegative, HWTowardPositive,
- TowardNegativeF32_TowardPositiveF64);
-
-/// Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
-static constexpr uint32_t
-decodeFltRoundToHWConversionTable(uint64_t FltRoundToHWConversionTable,
- uint32_t FltRounds) {
- uint32_t IndexVal = FltRounds;
- if (IndexVal > TowardNegative)
- IndexVal -= ExtendedFltRoundOffset;
- return (FltRoundToHWConversionTable >> (IndexVal << 2)) & 0xf;
-}
-
-uint32_t AMDGPU::decodeFltRoundToHWConversionTable(uint32_t FltRounds) {
- return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
- FltRounds);
-}
-
-static constexpr uint32_t decodeFltRoundToHW(uint32_t FltRounds) {
- return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
- FltRounds);
-}
-
-// Verify evaluation of FltRoundToHWConversionTable
-
-static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardZero) ==
- getModeRegisterRoundMode(HWTowardZero, HWTowardZero));
-static_assert(decodeFltRoundToHW(AMDGPUFltRounds::NearestTiesToEven) ==
- getModeRegisterRoundMode(HWNearestTiesToEven,
- HWNearestTiesToEven));
-static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardPositive) ==
- getModeRegisterRoundMode(HWTowardPositive, HWTowardPositive));
-static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardNegative) ==
- getModeRegisterRoundMode(HWTowardNegative, HWTowardNegative));
-
-static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardPositiveF64) ==
- getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardPositive));
-static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardNegativeF64) ==
- getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardNegative));
-static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardZeroF64) ==
- getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardZero));
-
-static_assert(decodeFltRoundToHW(TowardPositiveF32_NearestTiesToEvenF64) ==
- getModeRegisterRoundMode(HWTowardPositive, HWNearestTiesToEven));
-static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardNegativeF64) ==
- getModeRegisterRoundMode(HWTowardPositive, HWTowardNegative));
-static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardZeroF64) ==
- getModeRegisterRoundMode(HWTowardPositive, HWTowardZero));
-
-static_assert(decodeFltRoundToHW(TowardNegativeF32_NearestTiesToEvenF64) ==
- getModeRegisterRoundMode(HWTowardNegative, HWNearestTiesToEven));
-static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardPositiveF64) ==
- getModeRegisterRoundMode(HWTowardNegative, HWTowardPositive));
-static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardZeroF64) ==
- getModeRegisterRoundMode(HWTowardNegative, HWTowardZero));
-
-static_assert(decodeFltRoundToHW(TowardZeroF32_NearestTiesToEvenF64) ==
- getModeRegisterRoundMode(HWTowardZero, HWNearestTiesToEven));
-static_assert(decodeFltRoundToHW(TowardZeroF32_TowardPositiveF64) ==
- getModeRegisterRoundMode(HWTowardZero, HWTowardPositive));
-static_assert(decodeFltRoundToHW(TowardZeroF32_TowardNegativeF64) ==
- getModeRegisterRoundMode(HWTowardZero, HWTowardNegative));
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
index c86678a7325356..9fbd74c3eede32 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
+++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
@@ -144,13 +144,6 @@ static constexpr uint32_t F64FltRoundOffset = 2;
// values.
extern const uint64_t FltRoundConversionTable;
-// Bit indexed table to convert from FLT_ROUNDS values to hardware rounding mode
-// values
-extern const uint64_t FltRoundToHWConversionTable;
-
-/// Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
-uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds);
-
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
deleted file mode 100644
index 48abc49c41ae0a..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ /dev/null
@@ -1,1665 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s
-; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s
-; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
-
-declare void @llvm.set.rounding(i32)
-declare i32 @llvm.get.rounding()
-
-define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) {
-; GFX678-LABEL: s_set_rounding:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_add_i32 s34, s4, -4
-; GFX678-NEXT: s_min_u32 s34, s4, s34
-; GFX678-NEXT: s_lshl_b32 s36, s34, 2
-; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s34, s4, -4
-; GFX9-NEXT: s_min_u32 s34, s4, s34
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
-; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s34, s4, -4
-; GFX10-NEXT: s_min_u32 s34, s4, s34
-; GFX10-NEXT: s_lshl_b32 s36, s34, 2
-; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s0, s4, -4
-; GFX11-NEXT: s_min_u32 s0, s4, s0
-; GFX11-NEXT: s_lshl_b32 s2, s0, 2
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) {
-; GFX6-LABEL: s_set_rounding_kernel:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
-; GFX6-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX6-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s3, s2, -4
-; GFX6-NEXT: s_min_u32 s2, s2, s3
-; GFX6-NEXT: s_lshl_b32 s2, s2, 2
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX6-NEXT: s_endpgm
-;
-; GFX7-LABEL: s_set_rounding_kernel:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dword s2, s[0:1], 0x9
-; GFX7-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX7-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s3, s2, -4
-; GFX7-NEXT: s_min_u32 s2, s2, s3
-; GFX7-NEXT: s_lshl_b32 s2, s2, 2
-; GFX7-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: s_set_rounding_kernel:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX8-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s3, s2, -4
-; GFX8-NEXT: s_min_u32 s2, s2, s3
-; GFX8-NEXT: s_lshl_b32 s2, s2, 2
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: s_set_rounding_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s3, s2, -4
-; GFX9-NEXT: s_min_u32 s2, s2, s3
-; GFX9-NEXT: s_lshl_b32 s2, s2, 2
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: s_set_rounding_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s1, s0, -4
-; GFX10-NEXT: s_min_u32 s2, s0, s1
-; GFX10-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s2, s2, 2
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: s_set_rounding_kernel:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s2, s0, s1
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_endpgm
- call void @llvm.set.rounding(i32 %rounding)
- call void asm sideeffect "",""()
- ret void
-}
-
-define void @v_set_rounding(i32 %rounding) {
-; GFX6-LABEL: v_set_rounding:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, -4, v0
-; GFX6-NEXT: v_min_u32_e32 v0, v0, v1
-; GFX6-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v0
-; GFX6-NEXT: v_readfirstlane_b32 s4, v0
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_set_rounding:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, -4, v0
-; GFX7-NEXT: v_min_u32_e32 v0, v0, v1
-; GFX7-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX7-NEXT: v_lshr_b64 v[0:1], s[4:5], v0
-; GFX7-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_set_rounding:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, -4, v0
-; GFX8-NEXT: v_min_u32_e32 v0, v0, v1
-; GFX8-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_set_rounding:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v1, -4, v0
-; GFX9-NEXT: v_min_u32_e32 v0, v0, v1
-; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_set_rounding:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v1, -4, v0
-; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX10-NEXT: v_min_u32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5]
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_set_rounding:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, -4, v0
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: v_min_u32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define void @set_rounding_get_rounding() {
-; GFX678-LABEL: set_rounding_get_rounding:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
-; GFX678-NEXT: s_lshl_b32 s6, s4, 2
-; GFX678-NEXT: s_mov_b32 s4, 0xeb24da71
-; GFX678-NEXT: s_mov_b32 s5, 0xc96f385
-; GFX678-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX678-NEXT: s_and_b32 s4, s4, 15
-; GFX678-NEXT: s_add_i32 s5, s4, 4
-; GFX678-NEXT: s_cmp_lt_u32 s4, 4
-; GFX678-NEXT: s_cselect_b32 s4, s4, s5
-; GFX678-NEXT: s_add_i32 s5, s4, -4
-; GFX678-NEXT: s_min_u32 s4, s4, s5
-; GFX678-NEXT: s_lshl_b32 s6, s4, 2
-; GFX678-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: set_rounding_get_rounding:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
-; GFX9-NEXT: s_lshl_b32 s6, s4, 2
-; GFX9-NEXT: s_mov_b32 s4, 0xeb24da71
-; GFX9-NEXT: s_mov_b32 s5, 0xc96f385
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX9-NEXT: s_and_b32 s4, s4, 15
-; GFX9-NEXT: s_add_i32 s5, s4, 4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 4
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_add_i32 s5, s4, -4
-; GFX9-NEXT: s_min_u32 s4, s4, s5
-; GFX9-NEXT: s_lshl_b32 s6, s4, 2
-; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: set_rounding_get_rounding:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_getreg_b32 s6, hwreg(HW_REG_MODE, 0, 4)
-; GFX10-NEXT: s_mov_b32 s4, 0xeb24da71
-; GFX10-NEXT: s_mov_b32 s5, 0xc96f385
-; GFX10-NEXT: s_lshl_b32 s6, s6, 2
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX10-NEXT: s_and_b32 s4, s4, 15
-; GFX10-NEXT: s_add_i32 s5, s4, 4
-; GFX10-NEXT: s_cmp_lt_u32 s4, 4
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_add_i32 s5, s4, -4
-; GFX10-NEXT: s_min_u32 s4, s4, s5
-; GFX10-NEXT: s_lshl_b32 s6, s4, 2
-; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: set_rounding_get_rounding:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 4)
-; GFX11-NEXT: s_mov_b32 s0, 0xeb24da71
-; GFX11-NEXT: s_mov_b32 s1, 0xc96f385
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_and_b32 s0, s0, 15
-; GFX11-NEXT: s_add_i32 s1, s0, 4
-; GFX11-NEXT: s_cmp_lt_u32 s0, 4
-; GFX11-NEXT: s_cselect_b32 s0, s0, s1
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s2, s0, 2
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %rounding = call i32 @llvm.get.rounding()
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define void @s_set_rounding_0() {
-; GFX678-LABEL: s_set_rounding_0:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_0:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xf
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 0)
- ret void
-}
-
-define void @s_set_rounding_1() {
-; GFX678-LABEL: s_set_rounding_1:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_1:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x0
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 1)
- ret void
-}
-
-define void @s_set_rounding_2() {
-; GFX678-LABEL: s_set_rounding_2:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_2:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x5
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 2)
- ret void
-}
-
-define void @s_set_rounding_3() {
-; GFX678-LABEL: s_set_rounding_3:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_3:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xa
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 3)
- ret void
-}
-
-; Unsupported mode.
-define void @s_set_rounding_4() {
-; GFX678-LABEL: s_set_rounding_4:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_4:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xf
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 4)
- ret void
-}
-
-; undefined
-define void @s_set_rounding_5() {
-; GFX678-LABEL: s_set_rounding_5:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_5:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x0
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 5)
- ret void
-}
-
-; undefined
-define void @s_set_rounding_6() {
-; GFX678-LABEL: s_set_rounding_6:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_6:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 5
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_6:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x5
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 6)
- ret void
-}
-
-; "Dynamic"
-define void @s_set_rounding_7() {
-; GFX678-LABEL: s_set_rounding_7:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_7:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 10
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_7:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xa
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 7)
- ret void
-}
-
-; Invalid
-define void @s_set_rounding_neg1() {
-; GFX678-LABEL: s_set_rounding_neg1:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_neg1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_neg1:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xb
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 -1)
- ret void
-}
-
-; --------------------------------------------------------------------
-; Test extended values
-; --------------------------------------------------------------------
-
-; NearestTiesToEvenF32_TowardPositiveF64 = 8
-define void @s_set_rounding_8() {
-; GFX678-LABEL: s_set_rounding_8:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_8:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x4
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 8)
- ret void
-}
-
-; NearestTiesToEvenF32_TowardNegativeF64 = 9
-define void @s_set_rounding_9() {
-; GFX678-LABEL: s_set_rounding_9:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_9:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_9:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x8
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 9)
- ret void
-}
-
-; NearestTiesToEvenF32_TowardZeroF64 = 10
-define void @s_set_rounding_10() {
-; GFX678-LABEL: s_set_rounding_10:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_10:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 12
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_10:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xc
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 10)
- ret void
-}
-
-; TowardPositiveF32_NearestTiesToEvenF64 = 11
-define void @s_set_rounding_11() {
-; GFX678-LABEL: s_set_rounding_11:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_11:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_11:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x1
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 11)
- ret void
-}
-
-; TowardPositiveF32_TowardNegativeF64 = 12
-define void @s_set_rounding_12() {
-; GFX678-LABEL: s_set_rounding_12:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_12:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 9
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_12:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x9
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 12)
- ret void
-}
-
-; TowardPositiveF32_TowardZeroF64 = 13
-define void @s_set_rounding_13() {
-; GFX678-LABEL: s_set_rounding_13:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_13:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 13
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_13:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xd
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 13)
- ret void
-}
-
-; TowardNegativeF32_NearestTiesToEvenF64 = 14
-define void @s_set_rounding_14() {
-; GFX678-LABEL: s_set_rounding_14:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_14:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_14:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x2
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 14)
- ret void
-}
-
-; TowardNegativeF32_TowardPositiveF64 = 15
-define void @s_set_rounding_15() {
-; GFX678-LABEL: s_set_rounding_15:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 6
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_15:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 6
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_15:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x6
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 15)
- ret void
-}
-
-
-; TowardNegativeF32_TowardZeroF64 = 16
-define void @s_set_rounding_16() {
-; GFX678-LABEL: s_set_rounding_16:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 14
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_16:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xe
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 16)
- ret void
-}
-
-; TowardZeroF32_NearestTiesToEvenF64 = 17
-define void @s_set_rounding_17() {
-; GFX678-LABEL: s_set_rounding_17:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_17:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 3
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_17:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x3
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 17)
- ret void
-}
-
-; TowardZeroF32_TowardPositiveF64 = 18
-define void @s_set_rounding_18() {
-; GFX678-LABEL: s_set_rounding_18:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_18:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 7
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_18:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0x7
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 18)
- ret void
-}
-
-; TowardZeroF32_TowardNegativeF64 = 19,
-define void @s_set_rounding_19() {
-; GFX678-LABEL: s_set_rounding_19:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_19:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_19:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xb
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 19)
- ret void
-}
-
-; Invalid, out of bounds
-define void @s_set_rounding_20() {
-; GFX678-LABEL: s_set_rounding_20:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_20:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_20:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xb
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 20)
- ret void
-}
-
-define void @s_set_rounding_0xffff() {
-; GFX678-LABEL: s_set_rounding_0xffff:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_0xffff:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 11
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: s_set_rounding_0xffff:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: s_round_mode 0xb
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.set.rounding(i32 65535)
- ret void
-}
-
-; --------------------------------------------------------------------
-; Test optimization knowing the value can only be in the standard
-; range
-; --------------------------------------------------------------------
-
-define amdgpu_gfx void @s_set_rounding_i2_zeroext(i2 zeroext inreg %rounding) {
-; GFX6-LABEL: s_set_rounding_i2_zeroext:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_lshl_b32 s34, s4, 2
-; GFX6-NEXT: s_lshr_b32 s34, 0xa50f, s34
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_i2_zeroext:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_lshl_b32 s34, s4, 2
-; GFX7-NEXT: s_lshr_b32 s34, 0xa50f, s34
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_i2_zeroext:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX8-NEXT: s_lshl_b32 s34, s34, 2
-; GFX8-NEXT: s_lshr_b32 s34, 0xa50f, s34
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_i2_zeroext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX9-NEXT: s_lshl_b32 s34, s34, 2
-; GFX9-NEXT: s_lshr_b32 s34, 0xa50f, s34
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_i2_zeroext:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX10-NEXT: s_lshl_b32 s34, s34, 2
-; GFX10-NEXT: s_lshr_b32 s34, 0xa50f, s34
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_i2_zeroext:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
-; GFX11-NEXT: s_lshl_b32 s0, s0, 2
-; GFX11-NEXT: s_lshr_b32 s0, 0xa50f, s0
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %zext.rounding = zext i2 %rounding to i32
- call void @llvm.set.rounding(i32 %zext.rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
-; GFX6-LABEL: s_set_rounding_i2_signext:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s34, s4, -4
-; GFX6-NEXT: s_min_u32 s34, s4, s34
-; GFX6-NEXT: s_lshl_b32 s36, s34, 2
-; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_i2_signext:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s34, s4, -4
-; GFX7-NEXT: s_min_u32 s34, s4, s34
-; GFX7-NEXT: s_lshl_b32 s36, s34, 2
-; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_i2_signext:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i16 s34, s4
-; GFX8-NEXT: s_add_i32 s35, s34, -4
-; GFX8-NEXT: s_min_u32 s34, s34, s35
-; GFX8-NEXT: s_lshl_b32 s36, s34, 2
-; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_i2_signext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s34, s4
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
-; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_i2_signext:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i16 s34, s4
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s34, s34, s35
-; GFX10-NEXT: s_lshl_b32 s36, s34, 2
-; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_i2_signext:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s0, s4
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s2, s0, 2
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %sext.rounding = sext i2 %rounding to i32
- call void @llvm.set.rounding(i32 %sext.rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
-; GFX6-LABEL: s_set_rounding_i3_signext:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s34, s4, -4
-; GFX6-NEXT: s_min_u32 s34, s4, s34
-; GFX6-NEXT: s_lshl_b32 s36, s34, 2
-; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_i3_signext:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s34, s4, -4
-; GFX7-NEXT: s_min_u32 s34, s4, s34
-; GFX7-NEXT: s_lshl_b32 s36, s34, 2
-; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_i3_signext:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i16 s34, s4
-; GFX8-NEXT: s_add_i32 s35, s34, -4
-; GFX8-NEXT: s_min_u32 s34, s34, s35
-; GFX8-NEXT: s_lshl_b32 s36, s34, 2
-; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_i3_signext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s34, s4
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
-; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_i3_signext:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i16 s34, s4
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s34, s34, s35
-; GFX10-NEXT: s_lshl_b32 s36, s34, 2
-; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_i3_signext:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s0, s4
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s2, s0, 2
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %sext.rounding = sext i3 %rounding to i32
- call void @llvm.set.rounding(i32 %sext.rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
-; GFX6-LABEL: s_set_rounding_i3_zeroext:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s34, s4, -4
-; GFX6-NEXT: s_min_u32 s34, s4, s34
-; GFX6-NEXT: s_lshl_b32 s36, s34, 2
-; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_i3_zeroext:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s34, s4, -4
-; GFX7-NEXT: s_min_u32 s34, s4, s34
-; GFX7-NEXT: s_lshl_b32 s36, s34, 2
-; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_i3_zeroext:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX8-NEXT: s_add_i32 s35, s34, -4
-; GFX8-NEXT: s_min_u32 s34, s34, s35
-; GFX8-NEXT: s_lshl_b32 s36, s34, 2
-; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_i3_zeroext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
-; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_i3_zeroext:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s34, s34, s35
-; GFX10-NEXT: s_lshl_b32 s36, s34, 2
-; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_i3_zeroext:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s2, s0, 2
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %sext.rounding = zext i3 %rounding to i32
- call void @llvm.set.rounding(i32 %sext.rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
-; GFX6-LABEL: s_set_rounding_select_0_1:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX6-NEXT: v_readfirstlane_b32 s34, v0
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_select_0_1:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_cmp_lg_u32 s4, 0
-; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX7-NEXT: v_readfirstlane_b32 s34, v0
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_select_0_1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_cmp_lg_u32 s4, 0
-; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_mov_b32 s34, 0xa50f
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s34
-; GFX8-NEXT: v_readfirstlane_b32 s34, v0
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_0_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s34, 0xa50f
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34
-; GFX9-NEXT: v_readfirstlane_b32 s34, v0
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_0_1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX10-NEXT: v_readfirstlane_b32 s34, v0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_0_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 0, i32 1
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_1_3(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_1_3:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b32 s34, 0xa50, 10
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_1_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s34, 0xa50, 10
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_1_3:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, 0xa50, 10
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_1_3:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, 0xa50, 10
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 1, i32 3
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define void @v_set_rounding_select_1_3(i32 %cond) {
-; GFX678-LABEL: v_set_rounding_select_1_3:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xa50
-; GFX678-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX678-NEXT: v_cndmask_b32_e32 v0, 10, v1, vcc
-; GFX678-NEXT: v_readfirstlane_b32 s4, v0
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_set_rounding_select_1_3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xa50
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 10, v1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_set_rounding_select_1_3:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_set_rounding_select_1_3:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 10, 0xa50, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 1, i32 3
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
-; GFX6-LABEL: s_set_rounding_select_2_0:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
-; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX6-NEXT: v_readfirstlane_b32 s34, v0
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_select_2_0:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_cmp_eq_u32 s4, 0
-; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX7-NEXT: v_readfirstlane_b32 s34, v0
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_select_2_0:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_mov_b32 s34, 0xa50f
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s34
-; GFX8-NEXT: v_readfirstlane_b32 s34, v0
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_2_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: s_mov_b32 s34, 0xa50f
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34
-; GFX9-NEXT: v_readfirstlane_b32 s34, v0
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_2_0:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX10-NEXT: v_readfirstlane_b32 s34, v0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_2_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 2, i32 0
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_2_1:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_movk_i32 s34, 0xa5
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa50
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_2_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_movk_i32 s34, 0xa5
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa50
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_2_1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_movk_i32 s34, 0xa5
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa50
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_2_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_movk_i32 s0, 0xa5
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa50
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 2, i32 1
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_1_2:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_movk_i32 s34, 0xa50
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa5
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_1_2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_movk_i32 s34, 0xa50
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa5
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_1_2:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_movk_i32 s34, 0xa50
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa5
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_1_2:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_movk_i32 s0, 0xa50
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa5
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 1, i32 2
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_3_0:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b32 s34, 10, 0xa50f
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_3_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s34, 10, 0xa50f
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_3_0:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, 10, 0xa50f
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_3_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, 10, 0xa50f
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 3, i32 0
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_4_0:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX678-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX678-NEXT: v_readfirstlane_b32 s34, v0
-; GFX678-NEXT: s_lshl_b32 s34, s34, 2
-; GFX678-NEXT: s_add_i32 s35, s34, -4
-; GFX678-NEXT: s_min_u32 s34, s34, s35
-; GFX678-NEXT: s_lshl_b32 s36, s34, 2
-; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_4_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT: v_readfirstlane_b32 s34, v0
-; GFX9-NEXT: s_lshl_b32 s34, s34, 2
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
-; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_4_0:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT: v_readfirstlane_b32 s34, v0
-; GFX10-NEXT: s_lshl_b32 s34, s34, 2
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s34, s34, s35
-; GFX10-NEXT: s_lshl_b32 s36, s34, 2
-; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_4_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 2
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s2, s0, 2
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 4, i32 0
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_3_5:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b32 s34, 3, 5
-; GFX678-NEXT: s_add_i32 s35, s34, -4
-; GFX678-NEXT: s_min_u32 s34, s34, s35
-; GFX678-NEXT: s_lshl_b32 s36, s34, 2
-; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: s_set_rounding_select_3_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s34, 3, 5
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
-; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: s_set_rounding_select_3_5:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, 3, 5
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s34, s34, s35
-; GFX10-NEXT: s_lshl_b32 s36, s34, 2
-; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: s_set_rounding_select_3_5:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, 3, 5
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s2, s0, 2
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %cond, 0
- %rounding = select i1 %cmp, i32 3, i32 5
- call void @llvm.set.rounding(i32 %rounding)
- ret void
-}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
>From e0cd6937c5cb48a8b73016eac6bb3922d2252022 Mon Sep 17 00:00:00 2001
From: akshaykumars614 <88362922+akshaykumars614 at users.noreply.github.com>
Date: Sat, 4 May 2024 14:16:02 -0400
Subject: [PATCH 04/10] llvm/lib/CodeGen/TargetSchedule.cpp:132:12: warning:
Assert statement modifies 'NIter' (#90982)
Modified the assert statement
---
llvm/lib/CodeGen/TargetSchedule.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp
index ce59b096992d8e..b5db2a671f7d04 100644
--- a/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -129,7 +129,8 @@ resolveSchedClass(const MachineInstr *MI) const {
unsigned NIter = 0;
#endif
while (SCDesc->isVariant()) {
- assert(++NIter < 6 && "Variants are nested deeper than the magic number");
+ ++NIter;
+ assert(NIter < 6 && "Variants are nested deeper than the magic number");
SchedClass = STI->resolveSchedClass(SchedClass, MI, this);
SCDesc = SchedModel.getSchedClassDesc(SchedClass);
>From 80c1743bccc43d1ee202a91f251aeed053d97db7 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie at gmail.com>
Date: Sat, 4 May 2024 11:43:08 -0700
Subject: [PATCH 05/10] Revert "llvm/lib/CodeGen/TargetSchedule.cpp:132:12:
warning: Assert statement modifies 'NIter'" (#91079)
Reverts llvm/llvm-project#90982
NIter is declared only under !NDEBUG and used only in assertions, so it
was correct for the increment to live inside the assertion. (In fact, the
non-asserts build now fails, because the variable is incremented even
though it is never declared.)
---
llvm/lib/CodeGen/TargetSchedule.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp
index b5db2a671f7d04..ce59b096992d8e 100644
--- a/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -129,8 +129,7 @@ resolveSchedClass(const MachineInstr *MI) const {
unsigned NIter = 0;
#endif
while (SCDesc->isVariant()) {
- ++NIter;
- assert(NIter < 6 && "Variants are nested deeper than the magic number");
+ assert(++NIter < 6 && "Variants are nested deeper than the magic number");
SchedClass = STI->resolveSchedClass(SchedClass, MI, this);
SCDesc = SchedModel.getSchedClassDesc(SchedClass);
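
For readers following along, here is a minimal standalone sketch of the pattern the revert restores; all names (Node, resolve) are hypothetical and this is not the actual TargetSchedule.cpp code. The counter exists only in asserts-enabled builds, so the increment has to live inside the assert expression, which compiles away entirely under NDEBUG.

#include <cassert>

// Hypothetical stand-in for resolveSchedClass(): walk a chain of "variant"
// nodes, bounding the nesting depth only when assertions are enabled.
struct Node {
  const Node *Next;
  bool isVariant() const { return Next != nullptr; }
};

const Node *resolve(const Node *N) {
#ifndef NDEBUG
  unsigned NIter = 0; // declared only in asserts-enabled builds
#endif
  while (N->isVariant()) {
    // The increment is part of the assert expression: under NDEBUG the whole
    // statement compiles away, so NIter is never referenced in a build where
    // it is not declared.
    assert(++NIter < 6 && "Variants are nested deeper than the magic number");
    N = N->Next;
  }
  return N;
}

int main() {
  Node C{nullptr}, B{&C}, A{&B};
  return resolve(&A) == &C ? 0 : 1;
}

Compiling with -DNDEBUG removes the declaration and the increment together, which is exactly why hoisting the increment out of the assert broke non-asserts builds.
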
>From 3093fced87e4c321bf9f2a32d89effd6912131d5 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Sat, 4 May 2024 11:44:18 -0700
Subject: [PATCH 06/10] [SLP][NFC] Use std::optional::value_or.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 43cf0ce6c2b926..ea132a89d2002b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11647,7 +11647,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
return V;
return Builder.CreateIntCast(
V, VectorType::get(ScalarTy, VecTy->getElementCount()),
- IsSigned ? *IsSigned : !isKnownNonNegative(V, SimplifyQuery(*R.DL)));
+ IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
}
public:
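
A quick, self-contained illustration of the std::optional::value_or rewrite above (a generic example, not the SLPVectorizer code): value_or yields the contained value when the optional is engaged and the supplied fallback otherwise. One caveat worth knowing is that value_or always evaluates its argument, whereas the ternary it replaces only evaluates the fallback when the optional is empty.

#include <cassert>
#include <optional>

int main() {
  std::optional<bool> IsSigned;      // disengaged
  const bool Fallback = true;

  // The two spellings agree: both yield the fallback when the optional is
  // empty, and the contained value when it is engaged.
  bool A = IsSigned ? *IsSigned : Fallback;
  bool B = IsSigned.value_or(Fallback);
  assert(A == B && A == true);

  IsSigned = false;                  // engaged
  assert(IsSigned.value_or(Fallback) == false);
  return 0;
}
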
>From ad7499a15405922b5c292ec53d2fc07a6e3b103e Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy at amd.com>
Date: Sat, 4 May 2024 21:20:17 +0200
Subject: [PATCH 07/10] [flang][MLIR][OpenMP] Extend delayed privatization for
arrays and characters (#85023)
---
.../flang/Optimizer/Builder/HLFIRTools.h | 2 +-
.../lib/Lower/OpenMP/DataSharingProcessor.cpp | 34 ++++--
flang/lib/Lower/OpenMP/OpenMP.cpp | 9 +-
flang/lib/Optimizer/Builder/HLFIRTools.cpp | 15 +--
...elayed-privatization-allocatable-array.f90 | 67 ++++++++++++
.../OpenMP/delayed-privatization-array.f90 | 100 ++++++++++++++++++
.../delayed-privatization-character-array.f90 | 67 ++++++++++++
.../delayed-privatization-character.f90 | 59 +++++++++++
8 files changed, 336 insertions(+), 17 deletions(-)
create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-array.f90
create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-character.f90
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 6c36f7e84db688..cf7df38b1cdfbd 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -223,7 +223,7 @@ class EntityWithAttributes : public Entity {
using CleanupFunction = std::function<void()>;
std::pair<fir::ExtendedValue, std::optional<CleanupFunction>>
translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
- Entity entity);
+ Entity entity, bool contiguousHint = false);
/// Function to translate FortranVariableOpInterface to fir::ExtendedValue.
/// It may generates IR to unbox fir.boxchar, but has otherwise no side effects
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 2a418396cdafc4..3569a0f070beb9 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -15,6 +15,7 @@
#include "Utils.h"
#include "flang/Lower/PFTBuilder.h"
#include "flang/Lower/SymbolMap.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Semantics/tools.h"
@@ -438,8 +439,16 @@ void DataSharingProcessor::doPrivatize(
&allocRegion, /*insertPt=*/{}, symType, symLoc);
firOpBuilder.setInsertionPointToEnd(allocEntryBlock);
- symTable->addSymbol(*sym,
- fir::substBase(symExV, allocRegion.getArgument(0)));
+
+ fir::ExtendedValue localExV =
+ hlfir::translateToExtendedValue(
+ symLoc, firOpBuilder, hlfir::Entity{allocRegion.getArgument(0)},
+ /*contiguousHint=*/
+ Fortran::evaluate::IsSimplyContiguous(
+ *sym, converter.getFoldingContext()))
+ .first;
+
+ symTable->addSymbol(*sym, localExV);
symTable->pushScope();
cloneSymbol(sym);
firOpBuilder.create<mlir::omp::YieldOp>(
@@ -456,12 +465,23 @@ void DataSharingProcessor::doPrivatize(
mlir::Block *copyEntryBlock = firOpBuilder.createBlock(
&copyRegion, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc});
firOpBuilder.setInsertionPointToEnd(copyEntryBlock);
- symTable->addSymbol(*sym,
- fir::substBase(symExV, copyRegion.getArgument(0)),
- /*force=*/true);
+
+ auto addSymbol = [&](unsigned argIdx, bool force = false) {
+ symExV.match(
+ [&](const fir::MutableBoxValue &box) {
+ symTable->addSymbol(
+ *sym, fir::substBase(box, copyRegion.getArgument(argIdx)),
+ force);
+ },
+ [&](const auto &box) {
+ symTable->addSymbol(*sym, copyRegion.getArgument(argIdx), force);
+ });
+ };
+
+ addSymbol(0, true);
symTable->pushScope();
- symTable->addSymbol(*sym,
- fir::substBase(symExV, copyRegion.getArgument(1)));
+ addSymbol(1);
+
auto ip = firOpBuilder.saveInsertionPoint();
copyFirstPrivateSymbol(sym, &ip);
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index c54f100b73da3f..6758f980b8b279 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1439,13 +1439,18 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
reductionSyms;
allSymbols.append(privateSyms);
for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) {
- converter.bindSymbol(*arg, prv);
+ fir::ExtendedValue hostExV = converter.getSymbolExtendedValue(*arg);
+ converter.bindSymbol(*arg, hlfir::translateToExtendedValue(
+ loc, firOpBuilder, hlfir::Entity{prv},
+ /*contiguousHint=*/
+ Fortran::evaluate::IsSimplyContiguous(
+ *arg, converter.getFoldingContext()))
+ .first);
}
return allSymbols;
};
- // TODO Merge with the reduction CB.
genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp);
return genOpWithBody<mlir::omp::ParallelOp>(genInfo, clauseOps);
}
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 44779427ab557a..b32c3e50647e21 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -844,10 +844,9 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
return loopNest;
}
-static fir::ExtendedValue
-translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
- hlfir::Entity variable,
- bool forceHlfirBase = false) {
+static fir::ExtendedValue translateVariableToExtendedValue(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity variable,
+ bool forceHlfirBase = false, bool contiguousHint = false) {
assert(variable.isVariable() && "must be a variable");
/// When going towards FIR, use the original base value to avoid
/// introducing descriptors at runtime when they are not required.
@@ -858,7 +857,8 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
fir::MutableProperties{});
if (mlir::isa<fir::BaseBoxType>(base.getType())) {
- if (!variable.isSimplyContiguous() || variable.isPolymorphic() ||
+ bool contiguous = variable.isSimplyContiguous() || contiguousHint;
+ if (!contiguous || variable.isPolymorphic() ||
variable.isDerivedWithLengthParameters() || variable.isOptional()) {
llvm::SmallVector<mlir::Value> nonDefaultLbounds =
getNonDefaultLowerBounds(loc, builder, variable);
@@ -907,9 +907,10 @@ hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
- hlfir::Entity entity) {
+ hlfir::Entity entity, bool contiguousHint) {
if (entity.isVariable())
- return {translateVariableToExtendedValue(loc, builder, entity),
+ return {translateVariableToExtendedValue(loc, builder, entity, false,
+ contiguousHint),
std::nullopt};
if (entity.isProcedure()) {
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
new file mode 100644
index 00000000000000..47e163014fe868
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-array.f90
@@ -0,0 +1,67 @@
+! Test delayed privatization for allocatable arrays.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN: FileCheck %s
+
+subroutine delayed_privatization_private(var1, l1)
+ implicit none
+ integer(8):: l1
+ integer, allocatable, dimension(:) :: var1
+
+!$omp parallel firstprivate(var1)
+ var1(l1 + 1) = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<!fir.array<\?xi32>>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<{{\?}}xi32>>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+
+! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]]
+! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]]
+! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+
+! CHECK-NEXT: fir.if %[[ALLOC_COND]] {
+! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
+! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]]
+! CHECK-NEXT: fir.box_addr %[[PRIV_ARG_VAL]]
+! CHECK-NEXT: %[[C0_2:.*]] = arith.constant 0 : index
+! CHECK-NEXT: %[[CMP:.*]] = arith.cmpi sgt, %[[DIMS]]#1, %[[C0_2]] : index
+! CHECK-NEXT: %[[SELECT:.*]] = arith.select %[[CMP]], %[[DIMS]]#1, %[[C0_2]] : index
+! CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[SELECT]]
+! CHECK-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[SELECT]] : (index, index) -> !fir.shapeshift<1>
+! CHECK-NEXT: %[[EMBOX:.*]] = fir.embox %[[MEM]](%[[SHAPE_SHIFT]])
+! CHECK-NEXT: fir.store %[[EMBOX]] to %[[PRIV_ALLOC]]
+! CHECK-NEXT: } else {
+! CHECK-NEXT: %[[ZEROS:.*]] = fir.zero_bits
+! CHECK-NEXT: %[[C0_3:.*]] = arith.constant 0 : index
+! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C0_3]] : (index) -> !fir.shape<1>
+! CHECK-NEXT: %[[EMBOX_2:.*]] = fir.embox %[[ZEROS]](%[[SHAPE]])
+! CHECK-NEXT: fir.store %[[EMBOX_2]] to %[[PRIV_ALLOC]]
+! CHECK-NEXT: }
+
+! CHECK-NEXT: hlfir.declare
+! CHECK-NEXT: omp.yield
+
+! CHECK-NEXT: } copy {
+! CHECK-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK-NEXT: %[[PRIV_BASE_VAL:.*]] = fir.load %[[PRIV_PRIV_ARG]]
+! CHECK-NEXT: %[[PRIV_BASE_BOX:.*]] = fir.box_addr %[[PRIV_BASE_VAL]]
+! CHECK-NEXT: %[[PRIV_BASE_ADDR:.*]] = fir.convert %[[PRIV_BASE_BOX]]
+! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT: %[[COPY_COND:.*]] = arith.cmpi ne, %[[PRIV_BASE_ADDR]], %[[C0]] : i64
+
+
+! CHECK-NEXT: fir.if %[[COPY_COND]] {
+! CHECK-NEXT: %[[PRIV_ORIG_ARG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
+! CHECK-NEXT: hlfir.assign %[[PRIV_ORIG_ARG_VAL]] to %[[PRIV_BASE_VAL]] temporary_lhs
+! CHECK-NEXT: }
+! CHECK-NEXT: omp.yield
+! CHECK-NEXT: }
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-array.f90
new file mode 100644
index 00000000000000..1d291b2ac0febd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-array.f90
@@ -0,0 +1,100 @@
+! Test delayed privatization for arrays.
+
+! RUN: split-file %s %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %t/one_dim_array.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - \
+! RUN: %t/one_dim_array.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %t/two_dim_array.f90 2>&1 | FileCheck %s --check-prefix=TWO_DIM
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - \
+! RUN: %t/two_dim_array.f90 2>&1 | FileCheck %s --check-prefix=TWO_DIM
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %t/one_dim_array_default_lb.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM_DEFAULT_LB
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - \
+! RUN: %t/one_dim_array_default_lb.f90 2>&1 | FileCheck %s --check-prefix=ONE_DIM_DEFAULT_LB
+
+!--- one_dim_array.f90
+subroutine delayed_privatization_private_1d(var1, l1, u1)
+ implicit none
+ integer(8):: l1, u1
+ integer, dimension(l1:u1) :: var1
+
+!$omp parallel firstprivate(var1)
+ var1(l1 + 1) = 10
+!$omp end parallel
+end subroutine
+
+! ONE_DIM-LABEL: omp.private {type = firstprivate}
+! ONE_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?xi32>>]] alloc {
+
+! ONE_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! ONE_DIM: %[[C0:.*]] = arith.constant 0 : index
+! ONE_DIM-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]] : ([[TYPE]], index) -> (index, index, index)
+! ONE_DIM: %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<{{\?}}xi32>
+! ONE_DIM-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
+! ONE_DIM-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE_SHIFT]]) {uniq_name = "_QFdelayed_privatization_private_1dEvar1"}
+! ONE_DIM-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! ONE_DIM-NEXT: } copy {
+! ONE_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! ONE_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs
+! ONE_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
+! ONE_DIM-NEXT: }
+
+!--- two_dim_array.f90
+subroutine delayed_privatization_private_2d(var1, l1, u1, l2, u2)
+ implicit none
+ integer(8):: l1, u1, l2, u2
+ integer, dimension(l1:u1, l2:u2) :: var1
+
+!$omp parallel firstprivate(var1)
+ var1(l1 + 1, u2) = 10
+!$omp end parallel
+end subroutine
+
+! TWO_DIM-LABEL: omp.private {type = firstprivate}
+! TWO_DIM-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?x\?xi32>>]] alloc {
+
+! TWO_DIM-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! TWO_DIM: %[[C0:.*]] = arith.constant 0 : index
+! TWO_DIM-NEXT: %[[DIMS0:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]] : ([[TYPE]], index) -> (index, index, index)
+
+! TWO_DIM-NEXT: %[[C1:.*]] = arith.constant 1 : index
+! TWO_DIM-NEXT: %[[DIMS1:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C1]] : ([[TYPE]], index) -> (index, index, index)
+
+! TWO_DIM-NEXT: %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<{{\?}}x{{\?}}xi32>, %[[DIMS0]]#1, %[[DIMS1]]#1 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_private_2dEvar1"}
+! TWO_DIM-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS0]]#0, %[[DIMS0]]#1, %[[DIMS1]]#0, %[[DIMS1]]#1 : (index, index, index, index) -> !fir.shapeshift<2>
+
+! TWO_DIM-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE_SHIFT]]) {uniq_name = "_QFdelayed_privatization_private_2dEvar1"}
+! TWO_DIM-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! TWO_DIM-NEXT: } copy {
+! TWO_DIM-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! TWO_DIM-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]] temporary_lhs
+! TWO_DIM-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
+! TWO_DIM-NEXT: }
+
+!--- one_dim_array_default_lb.f90
+program main
+ implicit none
+ integer, dimension(10) :: var1
+
+!$omp parallel private(var1)
+ var1(1) = 10
+!$omp end parallel
+end program
+
+! ONE_DIM_DEFAULT_LB-LABEL: omp.private {type = private}
+! ONE_DIM_DEFAULT_LB-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.array<10xi32>>]] alloc {
+
+! ONE_DIM_DEFAULT_LB-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! ONE_DIM_DEFAULT_LB: %[[C10:.*]] = arith.constant 10 : index
+! ONE_DIM_DEFAULT_LB: %[[PRIV_ALLOCA:.*]] = fir.alloca !fir.array<10xi32>
+! ONE_DIM_DEFAULT_LB: %[[SHAPE:.*]] = fir.shape %[[C10]] : (index) -> !fir.shape<1>
+! ONE_DIM_DEFAULT_LB: hlfir.declare %[[PRIV_ALLOCA]](%[[SHAPE]])
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-character-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
new file mode 100644
index 00000000000000..9a9d0c01212c8d
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-character-array.f90
@@ -0,0 +1,67 @@
+! Test delayed privatization for the `CHARACTER` array type.
+
+! RUN: split-file %s %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %t/static_len.f90 2>&1 | FileCheck %s --check-prefix=STATIC_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/static_len.f90 2>&1 \
+! RUN: | FileCheck %s --check-prefix=STATIC_LEN
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %t/dyn_len.f90 2>&1 | FileCheck %s --check-prefix=DYN_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/dyn_len.f90 2>&1 \
+! RUN: | FileCheck %s --check-prefix=DYN_LEN
+
+!--- static_len.f90
+subroutine delayed_privatization_character_array_static_len(var1)
+ implicit none
+ character(len = 10) :: var1(5)
+
+!$omp parallel firstprivate(var1)
+ var1(1) = "test"
+!$omp end parallel
+end subroutine
+
+! STATIC_LEN-LABEL: omp.private {type = firstprivate}
+! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.array<5x!fir.char<1,10>>>]] alloc {
+
+! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! STATIC_LEN-DAG: %[[C5:.*]] = arith.constant 5 : index
+! STATIC_LEN-DAG: %[[C10:.*]] = arith.constant 10 : index
+! STATIC_LEN-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<5x!fir.char<1,10>>
+! STATIC_LEN-NEXT: %[[ARRAY_SHAPE:.*]] = fir.shape %[[C5]]
+! STATIC_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[ARRAY_SHAPE]]) typeparams %[[C10]]
+! STATIC_LEN-NEXT: omp.yield(%[[PRIV_DECL]]#0
+
+! STATIC_LEN-NEXT: } copy {
+! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! STATIC_LEN-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]]
+
+! STATIC_LEN-NEXT: omp.yield(%[[PRIV_PRIV_ARG]]
+! STATIC_LEN-NEXT: }
+
+!--- dyn_len.f90
+subroutine delayed_privatization_character_array_dynamic_len(var1, char_len, array_len)
+ implicit none
+ integer(8):: char_len
+ integer(8):: array_len
+ character(len = char_len) :: var1(array_len)
+
+!$omp parallel private(var1)
+ var1(1) = "test"
+!$omp end parallel
+end subroutine
+
+! DYN_LEN-LABEL: omp.private {type = private}
+! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.box<!fir.array<\?x!fir.char<1,\?>>>]] alloc {
+
+! DYN_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! DYN_LEN: %[[C0:.*]] = arith.constant 0 : index
+! DYN_LEN-NEXT: %[[BOX_DIM:.*]]:3 = fir.box_dims %[[PRIV_ARG]], %[[C0]]
+! DYN_LEN: %[[CHAR_LEN:.*]] = fir.box_elesize %[[PRIV_ARG]]
+! DYN_LEN-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<?x!fir.char<1,?>>(%[[CHAR_LEN]] : index)
+! DYN_LEN-NEXT: %[[ARRAY_SHAPE:.*]] = fir.shape
+! DYN_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[ARRAY_SHAPE]]) typeparams %[[CHAR_LEN]]
+
+! DYN_LEN-NEXT: omp.yield(%[[PRIV_DECL]]#0
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-character.f90 b/flang/test/Lower/OpenMP/delayed-privatization-character.f90
new file mode 100644
index 00000000000000..db678ab13bbe69
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-character.f90
@@ -0,0 +1,59 @@
+! Test delayed privatization for the `CHARACTER` type.
+
+! RUN: split-file %s %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %t/dyn_len.f90 2>&1 | FileCheck %s --check-prefix=DYN_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/dyn_len.f90 2>&1 \
+! RUN: | FileCheck %s --check-prefix=DYN_LEN
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - %t/static_len.f90 2>&1 | FileCheck %s --check-prefix=STATIC_LEN
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %t/static_len.f90 2>&1 \
+! RUN: | FileCheck %s --check-prefix=STATIC_LEN
+
+!--- dyn_len.f90
+subroutine delayed_privatization_character(var1, l)
+ implicit none
+ integer(8):: l
+ character(len = l) :: var1
+
+!$omp parallel firstprivate(var1)
+ var1 = "test"
+!$omp end parallel
+end subroutine
+
+! DYN_LEN-LABEL: omp.private {type = firstprivate}
+! DYN_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.boxchar<1>]] alloc {
+
+! DYN_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! DYN_LEN-NEXT: %[[UNBOX:.*]]:2 = fir.unboxchar %[[PRIV_ARG]]
+! DYN_LEN: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.char<1,?>(%[[UNBOX]]#1 : index)
+! DYN_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] typeparams %[[UNBOX]]#1
+! DYN_LEN-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.boxchar<1>)
+
+! DYN_LEN-NEXT: } copy {
+! DYN_LEN-NEXT: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+
+! DYN_LEN-NEXT: hlfir.assign %[[PRIV_ORIG_ARG]] to %[[PRIV_PRIV_ARG]]
+
+! DYN_LEN-NEXT: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.boxchar<1>)
+! DYN_LEN-NEXT: }
+
+!--- static_len.f90
+subroutine delayed_privatization_character_static_len(var1)
+ implicit none
+ character(len = 10) :: var1
+
+!$omp parallel private(var1)
+ var1 = "test"
+!$omp end parallel
+end subroutine
+
+! STATIC_LEN-LABEL: omp.private {type = private}
+! STATIC_LEN-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.char<1,10>>]] alloc {
+
+! STATIC_LEN-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! STATIC_LEN-NEXT: %[[C10:.*]] = arith.constant 10 : index
+! STATIC_LEN-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.char<1,10>
+! STATIC_LEN-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] typeparams %[[C10]]
>From f4f56027a39b7782518c9a6a3b4e85ac8edd63df Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Sat, 4 May 2024 12:33:12 -0700
Subject: [PATCH 08/10] [Transforms] Use StringRef::operator== instead of
StringRef::equals (NFC) (#91072)
I'm planning to remove StringRef::equals in favor of
StringRef::operator==.
- StringRef::operator==/!= outnumber StringRef::equals by a factor of
31 under llvm/ in terms of their usage.
- The elimination of StringRef::equals brings StringRef closer to
std::string_view, which has operator== but not equals.
- S == "foo" is more readable than S.equals("foo"), especially for
!Long.Expression.equals("str") vs Long.Expression != "str".
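For readers skimming the diff, a minimal sketch of the mechanical rewrite (the function name and string literal below are made up for illustration and are not from any of the touched files; it only assumes the llvm/ADT/StringRef.h header):

  #include "llvm/ADT/StringRef.h"

  // Illustrative only: the before/after shape of the change.
  static bool hasExpectedName(llvm::StringRef Name) {
    // Before: Name.equals("expected")   and   !Name.equals("expected")
    // After:  Name == "expected"        and   Name != "expected"
    return Name == "expected";
  }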
---
llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 5 ++--
llvm/lib/Transforms/IPO/BlockExtractor.cpp | 5 ++--
.../Instrumentation/PGOInstrumentation.cpp | 4 +--
llvm/lib/Transforms/Scalar/ADCE.cpp | 2 +-
.../lib/Transforms/Scalar/PlaceSafepoints.cpp | 2 +-
.../Scalar/RewriteStatepointsForGC.cpp | 4 +--
llvm/lib/Transforms/Utils/LoopUnroll.cpp | 2 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 2 +-
llvm/lib/Transforms/Utils/SymbolRewriter.cpp | 26 +++++++++----------
llvm/unittests/Transforms/Utils/LocalTest.cpp | 2 +-
10 files changed, 26 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 3a43b1edcaba37..4eb6e75d09fa53 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1846,11 +1846,10 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape,
auto ProjectionFunctionName =
Suspend->getAsyncContextProjectionFunction()->getName();
bool UseSwiftMangling = false;
- if (ProjectionFunctionName.equals("__swift_async_resume_project_context")) {
+ if (ProjectionFunctionName == "__swift_async_resume_project_context") {
ResumeNameSuffix = "TQ";
UseSwiftMangling = true;
- } else if (ProjectionFunctionName.equals(
- "__swift_async_resume_get_context")) {
+ } else if (ProjectionFunctionName == "__swift_async_resume_get_context") {
ResumeNameSuffix = "TY";
UseSwiftMangling = true;
}
diff --git a/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
index 0c406aa9822e73..ec1be35a331640 100644
--- a/llvm/lib/Transforms/IPO/BlockExtractor.cpp
+++ b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
@@ -142,9 +142,8 @@ bool BlockExtractor::runOnModule(Module &M) {
report_fatal_error("Invalid function name specified in the input file",
/*GenCrashDiag=*/false);
for (const auto &BBInfo : BInfo.second) {
- auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
- return BB.getName().equals(BBInfo);
- });
+ auto Res = llvm::find_if(
+ *F, [&](const BasicBlock &BB) { return BB.getName() == BBInfo; });
if (Res == F->end())
report_fatal_error("Invalid block name specified in the input file",
/*GenCrashDiag=*/false);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index b333b1582e802c..2269c2e0fffae9 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -2125,7 +2125,7 @@ static bool annotateAllFunctions(
HotFunctions.push_back(&F);
if (PGOViewCounts != PGOVCT_None &&
(ViewBlockFreqFuncName.empty() ||
- F.getName().equals(ViewBlockFreqFuncName))) {
+ F.getName() == ViewBlockFreqFuncName)) {
LoopInfo LI{DominatorTree(F)};
std::unique_ptr<BranchProbabilityInfo> NewBPI =
std::make_unique<BranchProbabilityInfo>(F, LI);
@@ -2140,7 +2140,7 @@ static bool annotateAllFunctions(
}
if (PGOViewRawCounts != PGOVCT_None &&
(ViewBlockFreqFuncName.empty() ||
- F.getName().equals(ViewBlockFreqFuncName))) {
+ F.getName() == ViewBlockFreqFuncName)) {
if (PGOViewRawCounts == PGOVCT_Graph)
if (ViewBlockFreqFuncName.empty())
WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index 96ecd7f368a000..5f0a9b22c3ee7b 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -350,7 +350,7 @@ bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
// TODO -- move this test into llvm::isInstructionTriviallyDead
if (CallInst *CI = dyn_cast<CallInst>(&I))
if (Function *Callee = CI->getCalledFunction())
- if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+ if (Callee->getName() == getInstrProfValueProfFuncName())
if (isa<Constant>(CI->getArgOperand(0)))
return true;
return false;
diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index f5c9aaa4f20bc7..77d155d7e78e3d 100644
--- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -591,7 +591,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F,
const char GCSafepointPollName[] = "gc.safepoint_poll";
static bool isGCSafepointPoll(Function &F) {
- return F.getName().equals(GCSafepointPollName);
+ return F.getName() == GCSafepointPollName;
}
/// Returns true if this function should be rewritten to include safepoint
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 330b464667ee46..286273c897aac1 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1685,10 +1685,10 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
// Pass through the requested lowering if any. The default is live-through.
StringRef DeoptLowering = getDeoptLowering(Call);
- if (DeoptLowering.equals("live-in"))
+ if (DeoptLowering == "live-in")
Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
else {
- assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+ assert(DeoptLowering == "live-through" && "Unsupported value!");
}
FunctionCallee CallTarget(Call->getFunctionType(), Call->getCalledOperand());
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index e9969ae64147d8..20978cf2e748ab 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1077,7 +1077,7 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
if (!S)
continue;
- if (Name.equals(S->getString()))
+ if (Name == S->getString())
return MD;
}
return nullptr;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index f783b708fa9eba..cc883a7dc2927a 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -222,7 +222,7 @@ void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
// If it is of form key = value, try to parse it.
if (Node->getNumOperands() == 2) {
MDString *S = dyn_cast<MDString>(Node->getOperand(0));
- if (S && S->getString().equals(StringMD)) {
+ if (S && S->getString() == StringMD) {
ConstantInt *IntMD =
mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
if (IntMD && IntMD->getSExtValue() == V)
diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index 8b4f34209e8530..d52d52a9b7d3ec 100644
--- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -308,11 +308,11 @@ bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
}
RewriteType = Key->getValue(KeyStorage);
- if (RewriteType.equals("function"))
+ if (RewriteType == "function")
return parseRewriteFunctionDescriptor(YS, Key, Value, DL);
- else if (RewriteType.equals("global variable"))
+ else if (RewriteType == "global variable")
return parseRewriteGlobalVariableDescriptor(YS, Key, Value, DL);
- else if (RewriteType.equals("global alias"))
+ else if (RewriteType == "global alias")
return parseRewriteGlobalAliasDescriptor(YS, Key, Value, DL);
YS.printError(Entry.getKey(), "unknown rewrite type");
@@ -348,7 +348,7 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
}
KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
+ if (KeyValue == "source") {
std::string Error;
Source = std::string(Value->getValue(ValueStorage));
@@ -356,11 +356,11 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
YS.printError(Field.getKey(), "invalid regex: " + Error);
return false;
}
- } else if (KeyValue.equals("target")) {
+ } else if (KeyValue == "target") {
Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
+ } else if (KeyValue == "transform") {
Transform = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("naked")) {
+ } else if (KeyValue == "naked") {
std::string Undecorated;
Undecorated = std::string(Value->getValue(ValueStorage));
@@ -417,7 +417,7 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
}
KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
+ if (KeyValue == "source") {
std::string Error;
Source = std::string(Value->getValue(ValueStorage));
@@ -425,9 +425,9 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
YS.printError(Field.getKey(), "invalid regex: " + Error);
return false;
}
- } else if (KeyValue.equals("target")) {
+ } else if (KeyValue == "target") {
Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
+ } else if (KeyValue == "transform") {
Transform = std::string(Value->getValue(ValueStorage));
} else {
YS.printError(Field.getKey(), "unknown Key for Global Variable");
@@ -480,7 +480,7 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
}
KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
+ if (KeyValue == "source") {
std::string Error;
Source = std::string(Value->getValue(ValueStorage));
@@ -488,9 +488,9 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
YS.printError(Field.getKey(), "invalid regex: " + Error);
return false;
}
- } else if (KeyValue.equals("target")) {
+ } else if (KeyValue == "target") {
Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
+ } else if (KeyValue == "transform") {
Transform = std::string(Value->getValue(ValueStorage));
} else {
YS.printError(Field.getKey(), "unknown key for Global Alias");
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index b871603328b202..b28ba2b1b4461d 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -1196,7 +1196,7 @@ TEST(Local, SimplifyCFGWithNullAC) {
// Obtain BasicBlock of interest to this test, %test.bb.
BasicBlock *TestBB = nullptr;
for (BasicBlock &BB : F) {
- if (BB.getName().equals("test.bb")) {
+ if (BB.getName() == "test.bb") {
TestBB = &BB;
break;
}
>From b776f4a65219a56e7bfc433c26fec0e9aad9c27a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 4 May 2024 20:53:53 +0100
Subject: [PATCH 09/10] [LV,LAA] Don't vectorize loops with load and store to
invar address.
Code checking stores to invariant addresses and reductions made an
incorrect assumption that the case of both a load & store to the same
invariant address does not need to be handled.
In some cases when vectorizing with runtime checks, there may be
dependences with a load and store to the same address, storing a
reduction value.
Update LAA to separately track whether there was a store-store or a
load-store dependence involving an invariant address.
Bail out early if there was a load-store dependence with an invariant
address. If there was a store-store dependence, still apply the logic
that checks whether all such stores store a reduction value.
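For intuition, a rough sketch of the loop shape this affects (illustrative C++ only, not the test case from the patch, and ignoring later scalar promotion): the accumulator lives at a loop-invariant address and is both loaded from and stored to on every iteration.

  // Illustrative only: a reduction whose accumulator is kept in memory at
  // a loop-invariant address, so each iteration has both a load from and
  // a store to that same address.
  void sum_into_invariant(int *acc, const int *src, long n) {
    for (long i = 0; i < n; ++i) {
      // load *acc, add, store *acc: a load-store dependence on the
      // invariant address 'acc', which LV now bails out on early.
      *acc = *acc + src[i];
    }
  }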
---
.../llvm/Analysis/LoopAccessAnalysis.h | 28 ++++++++++++++-----
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 14 ++++++----
.../Vectorize/LoopVectorizationLegality.cpp | 16 ++++++++---
.../reduction-with-invariant-store.ll | 3 +-
4 files changed, 43 insertions(+), 18 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index e39c371b41ec5c..1d67a71f43edde 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -579,7 +579,11 @@ class LoopAccessInfo {
AAResults *AA, DominatorTree *DT, LoopInfo *LI);
/// Return true we can analyze the memory accesses in the loop and there are
- /// no memory dependence cycles.
+ /// no memory dependence cycles. Note that for dependences between loads &
+ /// stores with uniform addresses,
+ /// hasStoreStoreDependenceInvolvingLoopInvariantAddress and
+ /// hasLoadStoreDependenceInvolvingLoopInvariantAddress also need to be
+ /// checked.
bool canVectorizeMemory() const { return CanVecMem; }
/// Return true if there is a convergent operation in the loop. There may
@@ -632,10 +636,16 @@ class LoopAccessInfo {
/// Print the information about the memory accesses in the loop.
void print(raw_ostream &OS, unsigned Depth = 0) const;
- /// If the loop has memory dependence involving an invariant address, i.e. two
- /// stores or a store and a load, then return true, else return false.
- bool hasDependenceInvolvingLoopInvariantAddress() const {
- return HasDependenceInvolvingLoopInvariantAddress;
+ /// Return true if the loop has memory dependence involving two stores to an
+ /// invariant address, else return false.
+ bool hasStoreStoreDependenceInvolvingLoopInvariantAddress() const {
+ return HasStoreStoreDependenceInvolvingLoopInvariantAddress;
+ }
+
+ /// Return true if the loop has memory dependence involving a load and a store
+ /// to an invariant address, else return false.
+ bool hasLoadStoreDependenceInvolvingLoopInvariantAddress() const {
+ return HasLoadStoreDependenceInvolvingLoopInvariantAddress;
}
/// Return the list of stores to invariant addresses.
@@ -697,8 +707,12 @@ class LoopAccessInfo {
bool CanVecMem = false;
bool HasConvergentOp = false;
- /// Indicator that there are non vectorizable stores to a uniform address.
- bool HasDependenceInvolvingLoopInvariantAddress = false;
+ /// Indicator that there are two non vectorizable stores to the same uniform
+ /// address.
+ bool HasStoreStoreDependenceInvolvingLoopInvariantAddress = false;
+ /// Indicator that there is non vectorizable load and store to the same
+ /// uniform address.
+ bool HasLoadStoreDependenceInvolvingLoopInvariantAddress = false;
/// List of stores to invariant addresses.
SmallVector<StoreInst *> StoresToInvariantAddresses;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index b0d29e2409f762..fc86523d3146f4 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2537,7 +2537,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
if (isInvariant(Ptr)) {
// Record store instructions to loop invariant addresses
StoresToInvariantAddresses.push_back(ST);
- HasDependenceInvolvingLoopInvariantAddress |=
+ HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
!UniformStores.insert(Ptr).second;
}
@@ -2593,7 +2593,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
if (UniformStores.count(Ptr)) {
LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
"load and uniform store to the same address!\n");
- HasDependenceInvolvingLoopInvariantAddress = true;
+ HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
}
MemoryLocation Loc = MemoryLocation::get(LD);
@@ -3057,9 +3057,13 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
PtrRtChecking->print(OS, Depth);
OS << "\n";
- OS.indent(Depth) << "Non vectorizable stores to invariant address were "
- << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
- << "found in loop.\n";
+ OS.indent(Depth)
+ << "Non vectorizable stores to invariant address were "
+ << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
+ HasLoadStoreDependenceInvolvingLoopInvariantAddress
+ ? ""
+ : "not ")
+ << "found in loop.\n";
OS.indent(Depth) << "SCEV assumptions:\n";
PSE->getPredicate().print(OS, Depth);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index d33743e74cbe31..9de49d1bcfeaca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1067,6 +1067,15 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
if (!LAI->canVectorizeMemory())
return false;
+ if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
+ reportVectorizationFailure("We don't allow storing to uniform addresses",
+ "write to a loop invariant address could not "
+ "be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE,
+ TheLoop);
+ return false;
+ }
+
// We can vectorize stores to invariant address when final reduction value is
// guaranteed to be stored at the end of the loop. Also, if decision to
// vectorize loop is made, runtime checks are added so as to make sure that
@@ -1102,13 +1111,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
}
- if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
// For each invariant address, check its last stored value is the result
// of one of our reductions.
//
- // We do not check if dependence with loads exists because they are
- // currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this
- // behaviour changes we have to modify this code.
+ // We do not check if dependence with loads exists because that is already
+ // checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
ScalarEvolution *SE = PSE.getSE();
SmallVector<StoreInst *, 4> UnhandledStores;
for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 2eda6874209497..8cf4e77a0d4990 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -123,9 +123,8 @@ exit:
; @reduc_store_load with a non-constant dependence distance, resulting in
; vectorization with runtime checks.
;
-; FIXME: currently this gets vectorized incorrectly.
; CHECK-LABEL: @reduc_store_load_with_non_constant_distance_dependence
-; CHECK: vector.body:
+; CHECK-NOT: vector.body:
define void @reduc_store_load_with_non_constant_distance_dependence(ptr %dst, ptr noalias %dst.2, i64 %off) {
entry:
%gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
>From b62d58400e39eec0462105dac8c126cb39736160 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20B=C3=B6ck?= <markus.boeck02 at gmail.com>
Date: Sat, 4 May 2024 20:57:22 +0100
Subject: [PATCH 10/10] [mlir][ODS][NFC] Deduplicate `ref` and `qualified`
handling (#91080)
Both the attribute and type format generator and the op format generator
independently implemented the parsing and verification of the `ref` and
`qualified` directives with little to no differences.
This PR moves the implementation of these into the common `FormatParser`
class to deduplicate the implementations.
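A condensed sketch of the refactoring pattern (simplified stand-in types, not the real tblgen classes `FormatParser`, `DefFormatParser`, or `OpFormatParser`): the base class owns the `qualified(...)` parsing and hands the parsed element to a virtual hook, so each generator only overrides what counts as a qualifiable element.

  #include <iostream>

  // Simplified stand-ins for illustration only.
  struct Element { bool Qualified = false; };

  class ParserBase {
  public:
    virtual ~ParserBase() = default;
    // Shared handling of `qualified(<element>)`: the syntax lives here
    // once; only the marking step is format-specific.
    bool parseQualified(Element &E) { return markQualified(E); }

  protected:
    // Overridden per generator: attr/type formats accept parameters, op
    // formats accept attribute variables and `type` directives.
    virtual bool markQualified(Element &E) = 0;
  };

  class DefParser : public ParserBase {
  protected:
    bool markQualified(Element &E) override {
      E.Qualified = true; // the real code also validates the element kind
      return true;
    }
  };

  int main() {
    Element E;
    DefParser P;
    std::cout << P.parseQualified(E) << ' ' << E.Qualified << '\n'; // 1 1
  }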
---
.../attr-or-type-format-invalid.td | 2 +-
.../tools/mlir-tblgen/AttrOrTypeFormatGen.cpp | 52 ++++---------------
mlir/tools/mlir-tblgen/FormatGen.cpp | 36 +++++++++++++
mlir/tools/mlir-tblgen/FormatGen.h | 10 +++-
mlir/tools/mlir-tblgen/OpFormatGen.cpp | 40 ++------------
5 files changed, 61 insertions(+), 79 deletions(-)
diff --git a/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td b/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td
index d3be4d8b8022a0..3a57cbca4d7bb7 100644
--- a/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td
+++ b/mlir/test/mlir-tblgen/attr-or-type-format-invalid.td
@@ -111,7 +111,7 @@ def InvalidTypeN : InvalidType<"InvalidTypeN", "invalid_n"> {
def InvalidTypeO : InvalidType<"InvalidTypeO", "invalid_o"> {
let parameters = (ins "int":$a);
- // CHECK: `ref` is only allowed inside custom directives
+ // CHECK: 'ref' is only valid within a `custom` directive
let assemblyFormat = "$a ref($a)";
}
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp
index 6098808c646f76..abd1fbdaf8c649 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeFormatGen.cpp
@@ -940,6 +940,8 @@ class DefFormatParser : public FormatParser {
ArrayRef<FormatElement *> elements,
FormatElement *anchor) override;
+ LogicalResult markQualified(SMLoc loc, FormatElement *element) override;
+
/// Parse an attribute or type variable.
FailureOr<FormatElement *> parseVariableImpl(SMLoc loc, StringRef name,
Context ctx) override;
@@ -950,12 +952,8 @@ class DefFormatParser : public FormatParser {
private:
/// Parse a `params` directive.
FailureOr<FormatElement *> parseParamsDirective(SMLoc loc, Context ctx);
- /// Parse a `qualified` directive.
- FailureOr<FormatElement *> parseQualifiedDirective(SMLoc loc, Context ctx);
/// Parse a `struct` directive.
FailureOr<FormatElement *> parseStructDirective(SMLoc loc, Context ctx);
- /// Parse a `ref` directive.
- FailureOr<FormatElement *> parseRefDirective(SMLoc loc, Context ctx);
/// Attribute or type tablegen def.
const AttrOrTypeDef &def;
@@ -1060,6 +1058,14 @@ DefFormatParser::verifyOptionalGroupElements(llvm::SMLoc loc,
return success();
}
+LogicalResult DefFormatParser::markQualified(SMLoc loc,
+ FormatElement *element) {
+ if (!isa<ParameterElement>(element))
+ return emitError(loc, "`qualified` argument list expected a variable");
+ cast<ParameterElement>(element)->setShouldBeQualified();
+ return success();
+}
+
FailureOr<DefFormat> DefFormatParser::parse() {
FailureOr<std::vector<FormatElement *>> elements = FormatParser::parse();
if (failed(elements))
@@ -1107,33 +1113,11 @@ DefFormatParser::parseDirectiveImpl(SMLoc loc, FormatToken::Kind kind,
return parseParamsDirective(loc, ctx);
case FormatToken::kw_struct:
return parseStructDirective(loc, ctx);
- case FormatToken::kw_ref:
- return parseRefDirective(loc, ctx);
- case FormatToken::kw_custom:
- return parseCustomDirective(loc, ctx);
-
default:
return emitError(loc, "unsupported directive kind");
}
}
-FailureOr<FormatElement *>
-DefFormatParser::parseQualifiedDirective(SMLoc loc, Context ctx) {
- if (failed(parseToken(FormatToken::l_paren,
- "expected '(' before argument list")))
- return failure();
- FailureOr<FormatElement *> var = parseElement(ctx);
- if (failed(var))
- return var;
- if (!isa<ParameterElement>(*var))
- return emitError(loc, "`qualified` argument list expected a variable");
- cast<ParameterElement>(*var)->setShouldBeQualified();
- if (failed(
- parseToken(FormatToken::r_paren, "expected ')' after argument list")))
- return failure();
- return var;
-}
-
FailureOr<FormatElement *> DefFormatParser::parseParamsDirective(SMLoc loc,
Context ctx) {
// It doesn't make sense to allow references to all parameters in a custom
@@ -1201,22 +1185,6 @@ FailureOr<FormatElement *> DefFormatParser::parseStructDirective(SMLoc loc,
return create<StructDirective>(std::move(vars));
}
-FailureOr<FormatElement *> DefFormatParser::parseRefDirective(SMLoc loc,
- Context ctx) {
- if (ctx != CustomDirectiveContext)
- return emitError(loc, "`ref` is only allowed inside custom directives");
-
- // Parse the child parameter element.
- FailureOr<FormatElement *> child;
- if (failed(parseToken(FormatToken::l_paren, "expected '('")) ||
- failed(child = parseElement(RefDirectiveContext)) ||
- failed(parseToken(FormatToken::r_paren, "expeced ')'")))
- return failure();
-
- // Only parameter elements are allowed to be parsed under a `ref` directive.
- return create<RefDirective>(*child);
-}
-
//===----------------------------------------------------------------------===//
// Interface
//===----------------------------------------------------------------------===//
diff --git a/mlir/tools/mlir-tblgen/FormatGen.cpp b/mlir/tools/mlir-tblgen/FormatGen.cpp
index d402748b96ad5f..7540e584b8fac5 100644
--- a/mlir/tools/mlir-tblgen/FormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/FormatGen.cpp
@@ -308,6 +308,10 @@ FailureOr<FormatElement *> FormatParser::parseDirective(Context ctx) {
if (tok.is(FormatToken::kw_custom))
return parseCustomDirective(loc, ctx);
+ if (tok.is(FormatToken::kw_ref))
+ return parseRefDirective(loc, ctx);
+ if (tok.is(FormatToken::kw_qualified))
+ return parseQualifiedDirective(loc, ctx);
return parseDirectiveImpl(loc, tok.getKind(), ctx);
}
@@ -430,6 +434,38 @@ FailureOr<FormatElement *> FormatParser::parseCustomDirective(SMLoc loc,
return create<CustomDirective>(nameTok->getSpelling(), std::move(arguments));
}
+FailureOr<FormatElement *> FormatParser::parseRefDirective(SMLoc loc,
+ Context context) {
+ if (context != CustomDirectiveContext)
+ return emitError(loc, "'ref' is only valid within a `custom` directive");
+
+ FailureOr<FormatElement *> arg;
+ if (failed(parseToken(FormatToken::l_paren,
+ "expected '(' before argument list")) ||
+ failed(arg = parseElement(RefDirectiveContext)) ||
+ failed(
+ parseToken(FormatToken::r_paren, "expected ')' after argument list")))
+ return failure();
+
+ return create<RefDirective>(*arg);
+}
+
+FailureOr<FormatElement *> FormatParser::parseQualifiedDirective(SMLoc loc,
+ Context ctx) {
+ if (failed(parseToken(FormatToken::l_paren,
+ "expected '(' before argument list")))
+ return failure();
+ FailureOr<FormatElement *> var = parseElement(ctx);
+ if (failed(var))
+ return var;
+ if (failed(markQualified(loc, *var)))
+ return failure();
+ if (failed(
+ parseToken(FormatToken::r_paren, "expected ')' after argument list")))
+ return failure();
+ return var;
+}
+
//===----------------------------------------------------------------------===//
// Utility Functions
//===----------------------------------------------------------------------===//
diff --git a/mlir/tools/mlir-tblgen/FormatGen.h b/mlir/tools/mlir-tblgen/FormatGen.h
index 18a410277fc108..b061d4d8ea7f03 100644
--- a/mlir/tools/mlir-tblgen/FormatGen.h
+++ b/mlir/tools/mlir-tblgen/FormatGen.h
@@ -495,9 +495,12 @@ class FormatParser {
FailureOr<FormatElement *> parseDirective(Context ctx);
/// Parse an optional group.
FailureOr<FormatElement *> parseOptionalGroup(Context ctx);
-
/// Parse a custom directive.
FailureOr<FormatElement *> parseCustomDirective(llvm::SMLoc loc, Context ctx);
+ /// Parse a ref directive.
+ FailureOr<FormatElement *> parseRefDirective(SMLoc loc, Context context);
+ /// Parse a qualified directive.
+ FailureOr<FormatElement *> parseQualifiedDirective(SMLoc loc, Context ctx);
/// Parse a format-specific variable kind.
virtual FailureOr<FormatElement *>
@@ -522,6 +525,11 @@ class FormatParser {
ArrayRef<FormatElement *> elements,
FormatElement *anchor) = 0;
+ /// Mark 'element' as qualified. If 'element' cannot be qualified an error
+ /// should be emitted and failure returned.
+ virtual LogicalResult markQualified(llvm::SMLoc loc,
+ FormatElement *element) = 0;
+
//===--------------------------------------------------------------------===//
// Lexer Utilities
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index 806991035e6685..f7cc0a292b8c53 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -2547,6 +2547,8 @@ class OpFormatParser : public FormatParser {
LogicalResult verifyOptionalGroupElement(SMLoc loc, FormatElement *element,
bool isAnchor);
+ LogicalResult markQualified(SMLoc loc, FormatElement *element) override;
+
/// Parse an operation variable.
FailureOr<FormatElement *> parseVariableImpl(SMLoc loc, StringRef name,
Context ctx) override;
@@ -2622,10 +2624,6 @@ class OpFormatParser : public FormatParser {
FailureOr<FormatElement *> parseOIListDirective(SMLoc loc, Context context);
LogicalResult verifyOIListParsingElement(FormatElement *element, SMLoc loc);
FailureOr<FormatElement *> parseOperandsDirective(SMLoc loc, Context context);
- FailureOr<FormatElement *> parseQualifiedDirective(SMLoc loc,
- Context context);
- FailureOr<FormatElement *> parseReferenceDirective(SMLoc loc,
- Context context);
FailureOr<FormatElement *> parseRegionsDirective(SMLoc loc, Context context);
FailureOr<FormatElement *> parseResultsDirective(SMLoc loc, Context context);
FailureOr<FormatElement *> parseSuccessorsDirective(SMLoc loc,
@@ -3224,16 +3222,12 @@ OpFormatParser::parseDirectiveImpl(SMLoc loc, FormatToken::Kind kind,
return parseFunctionalTypeDirective(loc, ctx);
case FormatToken::kw_operands:
return parseOperandsDirective(loc, ctx);
- case FormatToken::kw_qualified:
- return parseQualifiedDirective(loc, ctx);
case FormatToken::kw_regions:
return parseRegionsDirective(loc, ctx);
case FormatToken::kw_results:
return parseResultsDirective(loc, ctx);
case FormatToken::kw_successors:
return parseSuccessorsDirective(loc, ctx);
- case FormatToken::kw_ref:
- return parseReferenceDirective(loc, ctx);
case FormatToken::kw_type:
return parseTypeDirective(loc, ctx);
case FormatToken::kw_oilist:
@@ -3338,22 +3332,6 @@ OpFormatParser::parseOperandsDirective(SMLoc loc, Context context) {
return create<OperandsDirective>();
}
-FailureOr<FormatElement *>
-OpFormatParser::parseReferenceDirective(SMLoc loc, Context context) {
- if (context != CustomDirectiveContext)
- return emitError(loc, "'ref' is only valid within a `custom` directive");
-
- FailureOr<FormatElement *> arg;
- if (failed(parseToken(FormatToken::l_paren,
- "expected '(' before argument list")) ||
- failed(arg = parseElement(RefDirectiveContext)) ||
- failed(
- parseToken(FormatToken::r_paren, "expected ')' after argument list")))
- return failure();
-
- return create<RefDirective>(*arg);
-}
-
FailureOr<FormatElement *>
OpFormatParser::parseRegionsDirective(SMLoc loc, Context context) {
if (context == TypeDirectiveContext)
@@ -3495,19 +3473,11 @@ FailureOr<FormatElement *> OpFormatParser::parseTypeDirective(SMLoc loc,
return create<TypeDirective>(*operand);
}
-FailureOr<FormatElement *>
-OpFormatParser::parseQualifiedDirective(SMLoc loc, Context context) {
- FailureOr<FormatElement *> element;
- if (failed(parseToken(FormatToken::l_paren,
- "expected '(' before argument list")) ||
- failed(element = parseElement(context)) ||
- failed(
- parseToken(FormatToken::r_paren, "expected ')' after argument list")))
- return failure();
- return TypeSwitch<FormatElement *, FailureOr<FormatElement *>>(*element)
+LogicalResult OpFormatParser::markQualified(SMLoc loc, FormatElement *element) {
+ return TypeSwitch<FormatElement *, LogicalResult>(element)
.Case<AttributeVariable, TypeDirective>([](auto *element) {
element->setShouldBeQualified();
- return element;
+ return success();
})
.Default([&](auto *element) {
return this->emitError(