[llvm] [GlobalISel][AMDGPU] Implement FPInfo for LLT (PR #122503)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 10 09:46:45 PST 2025
https://github.com/tgymnich created https://github.com/llvm/llvm-project/pull/122503
Draft implementation of floating-point support for LLT. A short usage sketch of the proposed API follows the TODO list below.
### TODO
- [ ] add syntactic sugar to the legalizer DSL to allow easy type comparisons based on size
- [ ] fix the remaining broken tests
- [ ] update MIR tests
- [ ] improve the type inference pass to make updating MIR tests easier
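For context, here is a minimal usage sketch of the API this draft adds. It is illustrative only (not part of the patch) and exercises just the constructors and predicates that appear in the diff below; note that several named factories (`float16()`, `float32()`, `bfloat()`, ...) still lower to integer LLTs in this revision.

```cpp
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Support/TypeSize.h"
#include <cassert>
using namespace llvm;

void exampleUsage() {
  LLT I32 = LLT::integer(32);                                // 32-bit integer scalar
  LLT F64 = LLT::floatingPoint(64, LLT::FPInfo::IEEE_FLOAT); // 64-bit IEEE float
  LLT F128 = LLT::float128();                                // floatingPoint(128, IEEE_FLOAT)
  LLT V4F64 = LLT::vector(ElementCount::getFixed(4), F64);   // <4 x f64>

  assert(I32.isInteger(32) && !I32.isFloat());
  assert(F64.isFloat(64) && F64.getFPInfo() == LLT::FPInfo::IEEE_FLOAT);
  assert(V4F64.isFloatVector() && V4F64.getElementType() == F64);
  // dropType() strips the integer/float distinction and keeps only sizes.
  assert(V4F64.dropType() ==
         LLT::vector(ElementCount::getFixed(4), LLT::integer(64)));
}
```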
>From f21f6fddc7265a6c251e417b62b61e505f04365b Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Fri, 4 Oct 2024 09:20:06 +0000
Subject: [PATCH 01/12] apply llt-float.patch
---
llvm/include/llvm/CodeGen/LowLevelTypeUtils.h | 8 +-
llvm/include/llvm/CodeGenTypes/LowLevelType.h | 346 +++++++++++++-----
llvm/lib/CodeGen/LowLevelTypeUtils.cpp | 81 +++-
llvm/lib/CodeGenTypes/LowLevelType.cpp | 75 +++-
4 files changed, 375 insertions(+), 135 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h b/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h
index 142e5cd4e7ad17..af0e436e21166f 100644
--- a/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h
+++ b/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h
@@ -26,20 +26,20 @@ class Type;
struct fltSemantics;
/// Construct a low-level type based on an LLVM type.
-LLT getLLTForType(Type &Ty, const DataLayout &DL);
+LLT getLLTForType(Type &Ty, const DataLayout &DL, bool EnableFPInfo = false);
/// Get a rough equivalent of an MVT for a given LLT. MVT can't distinguish
/// pointers, so these will convert to a plain integer.
-MVT getMVTForLLT(LLT Ty);
+MVT getMVTForLLT(LLT Ty, bool EnableFPInfo = false);
EVT getApproximateEVTForLLT(LLT Ty, LLVMContext &Ctx);
/// Get a rough equivalent of an LLT for a given MVT. LLT does not yet support
/// scalarable vector types, and will assert if used.
-LLT getLLTForMVT(MVT Ty);
+LLT getLLTForMVT(MVT Ty, bool EnableFPInfo = false);
/// Get the appropriate floating point arithmetic semantic based on the bit size
/// of the given scalar LLT.
const llvm::fltSemantics &getFltSemanticForLLT(LLT Ty);
-}
+} // namespace llvm
#endif // LLVM_CODEGEN_LOWLEVELTYPEUTILS_H
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index 06879e1f8d15b0..cf5f740c364d39 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -28,78 +28,144 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include <cassert>
namespace llvm {
+extern cl::opt<bool> EnableFPInfo;
+
class Type;
class raw_ostream;
class LLT {
public:
+ enum class FPInfo {
+ IEEE_FLOAT = 0x0,
+ VARIANT_FLOAT_1 = 0x1,
+ VARIANT_FLOAT_2 = 0x2,
+ VARIANT_FLOAT_3 = 0x3,
+ };
+
+ enum class Kind : uint64_t {
+ INVALID = 0b000,
+ INTEGER = 0b001,
+ FLOAT = 0b010,
+ POINTER = 0b011,
+ VECTOR_INTEGER = 0b101,
+ VECTOR_FLOAT = 0b110,
+ VECTOR_POINTER = 0b111,
+ };
+
+ constexpr static Kind toVector(Kind Ty) {
+ if (Ty == Kind::POINTER)
+ return Kind::VECTOR_POINTER;
+
+ if (Ty == Kind::INTEGER)
+ return Kind::VECTOR_INTEGER;
+
+ if (Ty == Kind::FLOAT)
+ return Kind::VECTOR_FLOAT;
+
+ assert(false && "Type is already a vector type");
+ return Ty;
+ }
+
+ constexpr static Kind toScalar(Kind Ty) {
+ if (Ty == Kind::VECTOR_POINTER)
+ return Kind::POINTER;
+
+ if (Ty == Kind::VECTOR_INTEGER)
+ return Kind::INTEGER;
+
+ if (Ty == Kind::VECTOR_FLOAT)
+ return Kind::FLOAT;
+
+ assert(false && "Type is already a scalar type");
+ return Ty;
+ }
+
/// Get a low-level scalar or aggregate "bag of bits".
+ [[deprecated("Use LLT::integer(unsigned) instead.")]]
static constexpr LLT scalar(unsigned SizeInBits) {
- return LLT{/*isPointer=*/false, /*isVector=*/false, /*isScalar=*/true,
- ElementCount::getFixed(0), SizeInBits,
- /*AddressSpace=*/0};
+ return LLT{Kind::INTEGER, ElementCount::getFixed(0), SizeInBits,
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
+ }
+
+ static constexpr LLT integer(unsigned SizeInBits) {
+ return LLT{Kind::INTEGER, ElementCount::getFixed(0), SizeInBits,
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
+ }
+
+ static constexpr LLT floatingPoint(unsigned SizeInBits, FPInfo FP) {
+ return LLT{Kind::FLOAT, ElementCount::getFixed(0), SizeInBits,
+ /*AddressSpace=*/0, FP};
}
/// Get a low-level token; just a scalar with zero bits (or no size).
static constexpr LLT token() {
- return LLT{/*isPointer=*/false, /*isVector=*/false,
- /*isScalar=*/true, ElementCount::getFixed(0),
+ return LLT{Kind::INTEGER, ElementCount::getFixed(0),
/*SizeInBits=*/0,
- /*AddressSpace=*/0};
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
}
/// Get a low-level pointer in the given address space.
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
assert(SizeInBits > 0 && "invalid pointer size");
- return LLT{/*isPointer=*/true, /*isVector=*/false, /*isScalar=*/false,
- ElementCount::getFixed(0), SizeInBits, AddressSpace};
+ return LLT{Kind::POINTER, ElementCount::getFixed(0), SizeInBits,
+ AddressSpace, static_cast<FPInfo>(0)};
}
/// Get a low-level vector of some number of elements and element width.
+ [[deprecated("Use LLT::vector(EC, LLT) instead.")]]
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits) {
assert(!EC.isScalar() && "invalid number of vector elements");
- return LLT{/*isPointer=*/false, /*isVector=*/true, /*isScalar=*/false,
- EC, ScalarSizeInBits, /*AddressSpace=*/0};
+ return LLT{Kind::VECTOR_INTEGER, EC, ScalarSizeInBits,
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
}
/// Get a low-level vector of some number of elements and element type.
static constexpr LLT vector(ElementCount EC, LLT ScalarTy) {
assert(!EC.isScalar() && "invalid number of vector elements");
assert(!ScalarTy.isVector() && "invalid vector element type");
- return LLT{ScalarTy.isPointer(),
- /*isVector=*/true,
- /*isScalar=*/false,
- EC,
- ScalarTy.getSizeInBits().getFixedValue(),
- ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0};
+
+ Kind Info = toVector(ScalarTy.Info);
+ return LLT{Info, EC, ScalarTy.getSizeInBits().getFixedValue(),
+ ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0,
+ ScalarTy.isFloat() ? ScalarTy.getFPInfo()
+ : static_cast<FPInfo>(0)};
}
+ /// Get a 16-bit brain float value.
+ static constexpr LLT bfloat() { return integer(16); }
+
/// Get a 16-bit IEEE half value.
- /// TODO: Add IEEE semantics to type - This currently returns a simple `scalar(16)`.
- static constexpr LLT float16() {
- return scalar(16);
- }
+ static constexpr LLT float16() { return integer(16); }
/// Get a 32-bit IEEE float value.
- static constexpr LLT float32() {
- return scalar(32);
- }
+ static constexpr LLT float32() { return integer(32); }
/// Get a 64-bit IEEE double value.
- static constexpr LLT float64() {
- return scalar(64);
- }
+ static constexpr LLT float64() { return integer(64); }
+
+ /// Get an 80-bit X86 floating point value.
+ static constexpr LLT x86fp80() { return integer(80); }
+
+ /// Get a 128-bit IEEE quad value.
+ static constexpr LLT float128() { return floatingPoint(128, FPInfo::IEEE_FLOAT); }
+
+ /// Get a 128-bit PowerPC double double value.
+ static constexpr LLT ppcf128() { return integer(128); }
/// Get a low-level fixed-width vector of some number of elements and element
/// width.
+ [[deprecated("Use LLT::fixed_vector(unsigned, LLT) instead.")]]
static constexpr LLT fixed_vector(unsigned NumElements,
unsigned ScalarSizeInBits) {
- return vector(ElementCount::getFixed(NumElements), ScalarSizeInBits);
+ return vector(ElementCount::getFixed(NumElements),
+ LLT::integer(ScalarSizeInBits));
}
/// Get a low-level fixed-width vector of some number of elements and element
@@ -110,9 +176,11 @@ class LLT {
/// Get a low-level scalable vector of some number of elements and element
/// width.
+ [[deprecated("Use LLT::scalable_vector(unsigned, LLT) instead.")]]
static constexpr LLT scalable_vector(unsigned MinNumElements,
unsigned ScalarSizeInBits) {
- return vector(ElementCount::getScalable(MinNumElements), ScalarSizeInBits);
+ return vector(ElementCount::getScalable(MinNumElements),
+ LLT::integer(ScalarSizeInBits));
}
/// Get a low-level scalable vector of some number of elements and element
@@ -125,33 +193,77 @@ class LLT {
return EC.isScalar() ? ScalarTy : LLT::vector(EC, ScalarTy);
}
+ [[deprecated("Use LLT::scalarOrVector(EC, LLT) instead.")]]
static constexpr LLT scalarOrVector(ElementCount EC, uint64_t ScalarSize) {
assert(ScalarSize <= std::numeric_limits<unsigned>::max() &&
"Not enough bits in LLT to represent size");
- return scalarOrVector(EC, LLT::scalar(static_cast<unsigned>(ScalarSize)));
+ return scalarOrVector(EC, LLT::integer(static_cast<unsigned>(ScalarSize)));
}
- explicit constexpr LLT(bool isPointer, bool isVector, bool isScalar,
- ElementCount EC, uint64_t SizeInBits,
- unsigned AddressSpace)
+ explicit constexpr LLT(Kind Info, ElementCount EC, uint64_t SizeInBits,
+ unsigned AddressSpace, FPInfo FP)
: LLT() {
- init(isPointer, isVector, isScalar, EC, SizeInBits, AddressSpace);
+ init(Info, EC, SizeInBits, AddressSpace, FP);
}
- explicit constexpr LLT()
- : IsScalar(false), IsPointer(false), IsVector(false), RawData(0) {}
- explicit LLT(MVT VT);
+ explicit LLT(MVT VT, bool EnableFPInfo = false);
+ explicit constexpr LLT() : Info(static_cast<Kind>(0)), RawData(0) {}
- constexpr bool isValid() const { return IsScalar || RawData != 0; }
- constexpr bool isScalar() const { return IsScalar; }
- constexpr bool isToken() const { return IsScalar && RawData == 0; };
- constexpr bool isVector() const { return isValid() && IsVector; }
+ constexpr bool isValid() const {
+ return isToken() || RawData != 0;
+ }
+ constexpr bool isScalar() const {
+ return Info == Kind::INTEGER || Info == Kind::FLOAT;
+ }
+ constexpr bool isScalar(unsigned Size) const {
+ return isScalar() && getScalarSizeInBits() == Size;
+ }
+ constexpr bool isFloat() const { return isValid() && Info == Kind::FLOAT; }
+ constexpr bool isFloat(unsigned Size) const {
+ return isFloat() && getScalarSizeInBits() == Size;
+ }
+ constexpr bool isVariantFloat() const {
+ return isFloat() && (getFPInfo() == FPInfo::VARIANT_FLOAT_1 ||
+ getFPInfo() == FPInfo::VARIANT_FLOAT_2 ||
+ getFPInfo() == FPInfo::VARIANT_FLOAT_3);
+ }
+ constexpr bool isVariantFloat(FPInfo Variant) const {
+ return isFloat() && getFPInfo() == Variant;
+ }
+ constexpr bool isVariantFloat(unsigned Size, FPInfo Variant) const {
+ return isVariantFloat(Variant) && getScalarSizeInBits() == Size;
+ }
+ constexpr bool isFloatVector() const {
+ return isVector() && Info == Kind::VECTOR_FLOAT;
+ }
+ constexpr bool isBFloat() const { return isVariantFloat(16, FPInfo::VARIANT_FLOAT_1); }
+ constexpr bool isX86FP80() const { return isVariantFloat(80, FPInfo::VARIANT_FLOAT_1); }
+ constexpr bool isPPCF128() const { return isVariantFloat(128, FPInfo::VARIANT_FLOAT_1); }
+ constexpr bool isToken() const {
+ return Info == Kind::INTEGER && RawData == 0;
+ }
+ constexpr bool isInteger() const {
+ return isValid() && Info == Kind::INTEGER;
+ }
+ constexpr bool isInteger(unsigned Size) const {
+ return isInteger() && getScalarSizeInBits() == Size;
+ }
+ constexpr bool isIntegerVector() const {
+ return isVector() && Info == Kind::VECTOR_INTEGER;
+ }
+ constexpr bool isVector() const {
+ return isValid() &&
+ (Info == Kind::VECTOR_INTEGER || Info == Kind::VECTOR_FLOAT ||
+ Info == Kind::VECTOR_POINTER);
+ }
constexpr bool isPointer() const {
- return isValid() && IsPointer && !IsVector;
+ return isValid() && Info == Kind::POINTER;
+ }
+ constexpr bool isPointerVector() const {
+ return isVector() && Info == Kind::VECTOR_POINTER;
}
- constexpr bool isPointerVector() const { return IsPointer && isVector(); }
constexpr bool isPointerOrPointerVector() const {
- return IsPointer && isValid();
+ return isPointer() || isPointerVector();
}
/// Returns the number of elements in a vector LLT. Must only be called on
@@ -176,12 +288,18 @@ class LLT {
/// if the LLT is not a vector type.
constexpr bool isFixedVector() const { return isVector() && !isScalable(); }
+ constexpr bool isFixedVector(unsigned NumElements,
+ unsigned ScalarSize) const {
+ return isFixedVector() && getNumElements() == NumElements &&
+ getScalarSizeInBits() == ScalarSize;
+ }
+
/// Returns true if the LLT is a scalable vector. Returns false otherwise,
/// even if the LLT is not a vector type.
constexpr bool isScalableVector() const { return isVector() && isScalable(); }
constexpr ElementCount getElementCount() const {
- assert(IsVector && "cannot get number of elements on scalar/aggregate");
+ assert(isVector() && "cannot get number of elements on scalar/aggregate");
return ElementCount::get(getFieldValue(VectorElementsFieldInfo),
isScalable());
}
@@ -206,6 +324,13 @@ class LLT {
return isVector() ? getElementType() : *this;
}
+ constexpr FPInfo getFPInfo() const {
+ assert((isFloat() || isFloatVector()) &&
+ "cannot get FP info for non float type");
+
+ return FPInfo(getFieldValue(ScalarFPFieldInfo));
+ }
+
/// If this type is a vector, return a vector with the same number of elements
/// but the new element type. Otherwise, return the new element type.
constexpr LLT changeElementType(LLT NewEltTy) const {
@@ -216,10 +341,10 @@ class LLT {
/// but the new element size. Otherwise, return the new element type. Invalid
/// for pointer types. For pointer types, use changeElementType.
constexpr LLT changeElementSize(unsigned NewEltSize) const {
- assert(!isPointerOrPointerVector() &&
+ assert(!isPointerOrPointerVector() && !(isFloat() || isFloatVector()) &&
"invalid to directly change element size for pointers");
- return isVector() ? LLT::vector(getElementCount(), NewEltSize)
- : LLT::scalar(NewEltSize);
+ return isVector() ? LLT::vector(getElementCount(), LLT::integer(NewEltSize))
+ : LLT::integer(NewEltSize);
}
/// Return a vector or scalar with the same element type and the new element
@@ -228,6 +353,20 @@ class LLT {
return LLT::scalarOrVector(EC, getScalarType());
}
+ constexpr LLT changeElementCount(unsigned NumElements) const {
+ return changeElementCount(ElementCount::getFixed(NumElements));
+ }
+
+ constexpr LLT changeFPInfo(FPInfo FP) const {
+ assert((isFloat() || isFloatVector()) &&
+ "cannot change FPInfo for non floating point types");
+ if (isFloatVector())
+ return LLT::vector(getElementCount(), getElementType().changeFPInfo(FP));
+
+ return LLT::floatingPoint(getSizeInBits(), FP);
+ }
+
/// Return a type that is \p Factor times smaller. Reduces the number of
/// elements if this is a vector, or the bitwidth for scalar/pointers. Does
/// not attempt to handle cases that aren't evenly divisible.
@@ -242,7 +381,7 @@ class LLT {
}
assert(getScalarSizeInBits() % Factor == 0);
- return scalar(getScalarSizeInBits() / Factor);
+ return integer(getScalarSizeInBits() / Factor);
}
/// Produce a vector type that is \p Factor times bigger, preserving the
@@ -276,10 +415,23 @@ class LLT {
/// Returns the vector's element type. Only valid for vector types.
constexpr LLT getElementType() const {
assert(isVector() && "cannot get element type of scalar/aggregate");
- if (IsPointer)
+ if (isPointerVector())
return pointer(getAddressSpace(), getScalarSizeInBits());
- else
- return scalar(getScalarSizeInBits());
+
+ if (isFloatVector())
+ return floatingPoint(getScalarSizeInBits(), getFPInfo());
+
+ return integer(getScalarSizeInBits());
+ }
+
+ constexpr LLT dropType() const {
+ if (isPointer() || isPointerVector())
+ return *this;
+
+ if (isVector())
+ return vector(getElementCount(), LLT::integer(getScalarSizeInBits()));
+
+ return integer(getSizeInBits());
}
void print(raw_ostream &OS) const;
@@ -289,8 +441,7 @@ class LLT {
#endif
constexpr bool operator==(const LLT &RHS) const {
- return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector &&
- IsScalar == RHS.IsScalar && RHS.RawData == RawData;
+ return Info == RHS.Info && RawData == RHS.RawData;
}
constexpr bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
@@ -300,37 +451,33 @@ class LLT {
private:
/// LLT is packed into 64 bits as follows:
- /// isScalar : 1
- /// isPointer : 1
- /// isVector : 1
- /// with 61 bits remaining for Kind-specific data, packed in bitfields
- /// as described below. As there isn't a simple portable way to pack bits
- /// into bitfields, here the different fields in the packed structure is
+ /// Info : 3
+ /// RawData : 61
+ /// with 61 bits of RawData remaining for Kind-specific data, packed in
+ /// bitfields as described below. As there isn't a simple portable way to pack
+ /// bits into bitfields, here the different fields in the packed structure is
/// described in static const *Field variables. Each of these variables
/// is a 2-element array, with the first element describing the bitfield size
/// and the second element describing the bitfield offset.
///
- /// +--------+---------+--------+----------+----------------------+
- /// |isScalar|isPointer|isVector| RawData |Notes |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 0 | 0 | 0 |Invalid |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 0 | 1 | 0 |Tombstone Key |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 1 | 0 | 0 |Empty Key |
- /// +--------+---------+--------+----------+----------------------+
- /// | 1 | 0 | 0 | 0 |Token |
- /// +--------+---------+--------+----------+----------------------+
- /// | 1 | 0 | 0 | non-zero |Scalar |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 1 | 0 | non-zero |Pointer |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 0 | 1 | non-zero |Vector of non-pointer |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 1 | 1 | non-zero |Vector of pointer |
- /// +--------+---------+--------+----------+----------------------+
- ///
- /// Everything else is reserved.
+ /*
+ --- LLT ---
+
+ 63 56 47 39 31 23 15 7 0
+ | | | | | | | | |
+ |xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
+ ................................... (1)
+ ***************** (2)
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~ (3)
+ ^^^^^^^^^^^^^^^^^ (4)
+ @ (5)
+ ### (6)
+ %%% (7)
+
+ (1) ScalarSize (2) PointerSize (3) PointerAddressSpace
+ (4) VectorElements (5) VectorScalable (6) FPInfo (7) Kind
+
+ */
typedef int BitFieldInfo[2];
///
/// This is how the bitfields are packed per Kind:
@@ -340,6 +487,7 @@ class LLT {
/// * Non-pointer scalar (isPointer == 0 && isVector == 0):
/// SizeInBits: 32;
static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29};
+ static const constexpr BitFieldInfo ScalarFPFieldInfo{2, 21};
/// * Pointer (isPointer == 1 && isVector == 0):
/// SizeInBits: 16;
/// AddressSpace: 24;
@@ -357,20 +505,20 @@ class LLT {
/// AddressSpace: 24;
/// Scalable: 1;
- uint64_t IsScalar : 1;
- uint64_t IsPointer : 1;
- uint64_t IsVector : 1;
+ Kind Info : 3;
uint64_t RawData : 61;
static constexpr uint64_t getMask(const BitFieldInfo FieldInfo) {
const int FieldSizeInBits = FieldInfo[0];
return (((uint64_t)1) << FieldSizeInBits) - 1;
}
+
static constexpr uint64_t maskAndShift(uint64_t Val, uint64_t Mask,
uint8_t Shift) {
assert(Val <= Mask && "Value too large for field");
return (Val & Mask) << Shift;
}
+
static constexpr uint64_t maskAndShift(uint64_t Val,
const BitFieldInfo FieldInfo) {
return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]);
@@ -380,21 +528,20 @@ class LLT {
return getMask(FieldInfo) & (RawData >> FieldInfo[1]);
}
- constexpr void init(bool IsPointer, bool IsVector, bool IsScalar,
- ElementCount EC, uint64_t SizeInBits,
- unsigned AddressSpace) {
+ constexpr void init(Kind Info, ElementCount EC, uint64_t SizeInBits,
+ unsigned AddressSpace, FPInfo FP) {
assert(SizeInBits <= std::numeric_limits<unsigned>::max() &&
"Not enough bits in LLT to represent size");
- this->IsPointer = IsPointer;
- this->IsVector = IsVector;
- this->IsScalar = IsScalar;
- if (IsPointer) {
+ this->Info = Info;
+ if (Info == Kind::POINTER || Info == Kind::VECTOR_POINTER) {
RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) |
maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo);
} else {
- RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo);
+ RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo) |
+ maskAndShift((uint64_t) FP, ScalarFPFieldInfo);
}
- if (IsVector) {
+
+ if (Info == Kind::VECTOR_INTEGER || Info == Kind::VECTOR_FLOAT ||
+ Info == Kind::VECTOR_POINTER) {
RawData |= maskAndShift(EC.getKnownMinValue(), VectorElementsFieldInfo) |
maskAndShift(EC.isScalable() ? 1 : 0, VectorScalableFieldInfo);
}
@@ -402,25 +549,24 @@ class LLT {
public:
constexpr uint64_t getUniqueRAWLLTData() const {
- return ((uint64_t)RawData) << 3 | ((uint64_t)IsScalar) << 2 |
- ((uint64_t)IsPointer) << 1 | ((uint64_t)IsVector);
+ return ((uint64_t)RawData) << 3 | ((uint64_t)Info);
}
};
-inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
+inline raw_ostream &operator<<(raw_ostream &OS, const LLT &Ty) {
Ty.print(OS);
return OS;
}
-template<> struct DenseMapInfo<LLT> {
+template <> struct DenseMapInfo<LLT> {
static inline LLT getEmptyKey() {
LLT Invalid;
- Invalid.IsPointer = true;
+ Invalid.Info = static_cast<LLT::Kind>(2);
return Invalid;
}
static inline LLT getTombstoneKey() {
LLT Invalid;
- Invalid.IsVector = true;
+ Invalid.Info = static_cast<LLT::Kind>(3);
return Invalid;
}
static inline unsigned getHashValue(const LLT &Ty) {
@@ -428,10 +574,10 @@ template<> struct DenseMapInfo<LLT> {
return DenseMapInfo<uint64_t>::getHashValue(Val);
}
static bool isEqual(const LLT &LHS, const LLT &RHS) {
- return LHS == RHS;
+ return LHS.getUniqueRAWLLTData() == RHS.getUniqueRAWLLTData();
}
};
-}
+} // namespace llvm
#endif // LLVM_CODEGEN_LOWLEVELTYPE_H
diff --git a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
index 936c9fbb2fff02..f229d954db3f2b 100644
--- a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
+++ b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
@@ -15,12 +15,13 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
+LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL, bool EnableFPInfo) {
if (auto VTy = dyn_cast<VectorType>(&Ty)) {
auto EC = VTy->getElementCount();
- LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL);
+ LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL, EnableFPInfo);
if (EC.isScalar())
return ScalarTy;
return LLT::vector(EC, ScalarTy);
@@ -36,7 +37,37 @@ LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
// concerned.
auto SizeInBits = DL.getTypeSizeInBits(&Ty);
assert(SizeInBits != 0 && "invalid zero-sized type");
- return LLT::scalar(SizeInBits);
+
+ if (Ty.isFloatingPointTy()) {
+ if (Ty.isHalfTy())
+ return LLT::float16();
+
+ if (Ty.isBFloatTy())
+ return LLT::bfloat();
+
+ if (Ty.isFloatTy())
+ return LLT::float32();
+
+ if (Ty.isDoubleTy())
+ return LLT::float64();
+
+ if (Ty.isX86_FP80Ty())
+ return LLT::x86fp80();
+
+ if (Ty.isFP128Ty())
+ return LLT::float128();
+
+ if (Ty.isPPC_FP128Ty())
+ return LLT::ppcf128();
+
+ llvm_unreachable("Unhandled LLVM IR floating point type");
+ }
+
+ if (Ty.isIntegerTy()) {
+ return LLT::integer(SizeInBits);
+ }
+
+ return LLT::integer(SizeInBits);
}
if (Ty.isTokenTy())
@@ -45,13 +76,26 @@ LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
return LLT();
}
-MVT llvm::getMVTForLLT(LLT Ty) {
- if (!Ty.isVector())
- return MVT::getIntegerVT(Ty.getSizeInBits());
+MVT llvm::getMVTForLLT(LLT Ty, bool EnableFPInfo) {
+ if (Ty.isVector()) {
+ return MVT::getVectorVT(getMVTForLLT(Ty.getElementType()), Ty.getElementCount());
+ }
+
+ if (Ty.isFloat()) {
+ if (Ty == LLT::bfloat())
+ return MVT::bf16;
+
+ if (Ty == LLT::x86fp80())
+ return MVT::f80;
+
+ if (Ty == LLT::ppcf128())
+ return MVT::ppcf128;
+
+ return MVT::getFloatingPointVT(Ty.getSizeInBits());
+ }
- return MVT::getVectorVT(
- MVT::getIntegerVT(Ty.getElementType().getSizeInBits()),
- Ty.getElementCount());
+
+ return MVT::getIntegerVT(Ty.getSizeInBits());
}
EVT llvm::getApproximateEVTForLLT(LLT Ty, LLVMContext &Ctx) {
@@ -63,16 +107,29 @@ EVT llvm::getApproximateEVTForLLT(LLT Ty, LLVMContext &Ctx) {
return EVT::getIntegerVT(Ctx, Ty.getSizeInBits());
}
-LLT llvm::getLLTForMVT(MVT Ty) {
+LLT llvm::getLLTForMVT(MVT Ty, bool EnableFPInfo) {
+ if (EnableFPInfo)
+ return LLT(Ty, EnableFPInfo);
+
if (!Ty.isVector())
- return LLT::scalar(Ty.getSizeInBits());
+ return LLT::integer(Ty.getSizeInBits());
return LLT::scalarOrVector(Ty.getVectorElementCount(),
- Ty.getVectorElementType().getSizeInBits());
+ LLT::integer(Ty.getVectorElementType().getSizeInBits()));
}
const llvm::fltSemantics &llvm::getFltSemanticForLLT(LLT Ty) {
assert(Ty.isScalar() && "Expected a scalar type.");
+
+ if (Ty.isBFloat())
+ return APFloat::BFloat();
+ if (Ty.isX86FP80())
+ return APFloat::x87DoubleExtended();
+ if (Ty.isPPCF128())
+ return APFloat::PPCDoubleDouble();
+
+ assert(!Ty.isVariantFloat() && "Unhandled variant float type");
+
switch (Ty.getSizeInBits()) {
case 16:
return APFloat::IEEEhalf();
diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp
index 4785f2652b00e8..947b22de67cff1 100644
--- a/llvm/lib/CodeGenTypes/LowLevelType.cpp
+++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp
@@ -16,36 +16,72 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-LLT::LLT(MVT VT) {
- if (VT.isVector()) {
- bool asVector = VT.getVectorMinNumElements() > 1 || VT.isScalableVector();
- init(/*IsPointer=*/false, asVector, /*IsScalar=*/!asVector,
- VT.getVectorElementCount(), VT.getVectorElementType().getSizeInBits(),
- /*AddressSpace=*/0);
- } else if (VT.isValid() && !VT.isScalableTargetExtVT()) {
- // Aggregates are no different from real scalars as far as GlobalISel is
- // concerned.
- init(/*IsPointer=*/false, /*IsVector=*/false, /*IsScalar=*/true,
- ElementCount::getFixed(0), VT.getSizeInBits(), /*AddressSpace=*/0);
- } else {
- IsScalar = false;
- IsPointer = false;
- IsVector = false;
- RawData = 0;
+cl::opt<bool> llvm::EnableFPInfo(
+ "enable-fpinfo",
+ cl::desc("Enable low level types to carry floating point information"),
+ cl::Optional, cl::Hidden, cl::init(false));
+
+static std::optional<LLT::FPInfo> deriveFPInfo(MVT VT) {
+ if (!VT.isFloatingPoint())
+ return std::nullopt;
+
+ switch (VT.getScalarType().SimpleTy) {
+ case MVT::bf16:
+ case MVT::ppcf128:
+ return LLT::FPInfo::VARIANT_FLOAT_1;
+ default:
+ return LLT::FPInfo::IEEE_FLOAT;
}
}
+LLT::LLT(MVT VT, bool EnableFPInfo) {
+ auto FP = EnableFPInfo ? deriveFPInfo(VT) : std::nullopt;
+ bool AsVector = VT.isVector() && (VT.getVectorMinNumElements() > 1 || VT.isScalableVector());
+
+ Kind Info;
+ if (EnableFPInfo && FP.has_value())
+ Info = AsVector ? Kind::VECTOR_FLOAT : Kind::FLOAT;
+ else
+ Info = AsVector ? Kind::VECTOR_INTEGER : Kind::INTEGER;
+
+ if (VT.isVector()) {
+ init(Info,
+ VT.getVectorElementCount(),
+ VT.getVectorElementType().getSizeInBits(),
+ /*AddressSpace=*/0, FP.value_or(FPInfo::IEEE_FLOAT));
+ } else if (VT.isValid() && !VT.isScalableTargetExtVT()) {
+ // Aggregates are no different from real scalars as far as GlobalISel is
+ // concerned.
+ init(Info, ElementCount::getFixed(0), VT.getSizeInBits(),
+ /*AddressSpace=*/0, FP.value_or(FPInfo::IEEE_FLOAT));
+ } else {
+ this->Info = static_cast<Kind>(0);
+ this->RawData = 0;
+ }
+ }
+
void LLT::print(raw_ostream &OS) const {
+ constexpr bool EnableFPInfo = false;
if (isVector()) {
OS << "<";
OS << getElementCount() << " x " << getElementType() << ">";
- } else if (isPointer())
+ } else if (isPointer()) {
OS << "p" << getAddressSpace();
- else if (isValid()) {
+ } else if (EnableFPInfo && isBFloat()) {
+ OS << "bf16";
+ } else if (EnableFPInfo && isPPCF128()) {
+ OS << "ppcf128";
+ } else if (EnableFPInfo && isFloat()) {
+ assert(!isVariantFloat() && "unknown float variant");
+ OS << "f" << getScalarSizeInBits();
+ } else if (EnableFPInfo && isInteger()) {
+ OS << "i" << getScalarSizeInBits();
+ } else if (isValid()) {
assert(isScalar() && "unexpected type");
OS << "s" << getScalarSizeInBits();
- } else
+ } else {
OS << "LLT_invalid";
+ }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -56,6 +92,7 @@ LLVM_DUMP_METHOD void LLT::dump() const {
#endif
const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::ScalarFPFieldInfo;
const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo;
const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo;
const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo;
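As a quick illustration (not part of the patch), the reworked LLT(MVT, bool EnableFPInfo) constructor above, together with deriveFPInfo(), is expected to classify MVTs roughly as follows:

    // Illustrative sketch of the assumed mapping; see deriveFPInfo() above.
    LLT A(MVT::f32,   /*EnableFPInfo=*/true);  // Kind::FLOAT,  FPInfo::IEEE_FLOAT
    LLT B(MVT::bf16,  /*EnableFPInfo=*/true);  // Kind::FLOAT,  FPInfo::VARIANT_FLOAT_1
    LLT C(MVT::i32,   /*EnableFPInfo=*/true);  // Kind::INTEGER
    LLT D(MVT::v4f32, /*EnableFPInfo=*/true);  // Kind::VECTOR_FLOAT, 4 x 32-bit elements
    LLT E(MVT::f32,   /*EnableFPInfo=*/false); // Kind::INTEGER (legacy behaviour)

Printing is unchanged for now because LLT::print() pins its local EnableFPInfo to false, which lines up with the "update MIR tests" TODO.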
>From bb1b54b66fde42073bbf848883df2551c9b355e6 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Mon, 11 Nov 2024 16:33:31 +0100
Subject: [PATCH 02/12] add FP LLTs to TableGen
---
.../GlobalISel/GlobalISelMatchTable.cpp | 101 +++++++++++-------
1 file changed, 62 insertions(+), 39 deletions(-)
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index a81f2b53f2846e..fe2757ac66a3c6 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -349,42 +349,95 @@ std::string LLTCodeGen::getCxxEnumValue() const {
void LLTCodeGen::emitCxxEnumValue(raw_ostream &OS) const {
if (Ty.isScalar()) {
- OS << "GILLT_s" << Ty.getSizeInBits();
+ if (Ty.isBFloat())
+ OS << "GILLT_bf16";
+ else if (Ty.isPPCF128())
+ OS << "GILLT_ppcf128";
+ else if (Ty.isX86FP80())
+ OS << "GILLT_x86fp80";
+ else if (Ty.isFloat())
+ OS << "GILLT_f" << Ty.getSizeInBits();
+ else if (Ty.isInteger())
+ OS << "GILLT_i" << Ty.getSizeInBits();
+ else
+ OS << "GILLT_s" << Ty.getSizeInBits();
return;
}
if (Ty.isVector()) {
OS << (Ty.isScalable() ? "GILLT_nxv" : "GILLT_v")
- << Ty.getElementCount().getKnownMinValue() << "s"
- << Ty.getScalarSizeInBits();
+ << Ty.getElementCount().getKnownMinValue();
+
+ LLT ElemTy = Ty.getElementType();
+ if (ElemTy.isBFloat())
+ OS << "bf16";
+ else if (ElemTy.isPPCF128())
+ OS << "ppcf128";
+ else if (ElemTy.isX86FP80())
+ OS << "x86fp80";
+ else if (ElemTy.isFloat())
+ OS << "f" << ElemTy.getSizeInBits();
+ else if (ElemTy.isInteger())
+ OS << "i" << ElemTy.getSizeInBits();
+ else
+ OS << "s" << ElemTy.getSizeInBits();
return;
}
+
if (Ty.isPointer()) {
OS << "GILLT_p" << Ty.getAddressSpace();
if (Ty.getSizeInBits() > 0)
OS << "s" << Ty.getSizeInBits();
return;
}
+
llvm_unreachable("Unhandled LLT");
}
void LLTCodeGen::emitCxxConstructorCall(raw_ostream &OS) const {
if (Ty.isScalar()) {
- OS << "LLT::scalar(" << Ty.getSizeInBits() << ")";
+ if (Ty.isInteger())
+ OS << "LLT::integer(" << Ty.getScalarSizeInBits() << ")";
+ else if (Ty.isBFloat())
+ OS << "LLT::bfloat()";
+ else if (Ty.isPPCF128())
+ OS << "LLT::ppcf128()";
+ else if (Ty.isX86FP80())
+ OS << "LLT::x86fp80()";
+ else if (Ty.isFloat())
+ OS << "LLT::floatingPoint(" << Ty.getScalarSizeInBits()
+ << ", LLT::FPInfo::IEEE_FLOAT)";
return;
}
+
if (Ty.isVector()) {
OS << "LLT::vector("
<< (Ty.isScalable() ? "ElementCount::getScalable("
: "ElementCount::getFixed(")
- << Ty.getElementCount().getKnownMinValue() << "), "
- << Ty.getScalarSizeInBits() << ")";
+ << Ty.getElementCount().getKnownMinValue() << "), ";
+
+ LLT ElemTy = Ty.getElementType();
+ if (ElemTy.isInteger())
+ OS << "LLT::integer(" << ElemTy.getScalarSizeInBits() << ")";
+ else if (ElemTy.isBFloat())
+ OS << "LLT::bfloat()";
+ else if (ElemTy.isPPCF128())
+ OS << "LLT::ppcf128()";
+ else if (ElemTy.isX86FP80())
+ OS << "LLT::x86fp80()";
+ else if (ElemTy.isFloat())
+ OS << "LLT::floatingPoint(" << ElemTy.getScalarSizeInBits()
+ << ", LLT::FPInfo::IEEE_FLOAT)";
+
+ OS << ")";
return;
}
+
if (Ty.isPointer() && Ty.getSizeInBits() > 0) {
OS << "LLT::pointer(" << Ty.getAddressSpace() << ", " << Ty.getSizeInBits()
<< ")";
return;
}
+
llvm_unreachable("Unhandled LLT");
}
@@ -392,36 +445,7 @@ void LLTCodeGen::emitCxxConstructorCall(raw_ostream &OS) const {
/// particular logic behind the order but either A < B or B < A must be
/// true if A != B.
bool LLTCodeGen::operator<(const LLTCodeGen &Other) const {
- if (Ty.isValid() != Other.Ty.isValid())
- return Ty.isValid() < Other.Ty.isValid();
- if (!Ty.isValid())
- return false;
-
- if (Ty.isVector() != Other.Ty.isVector())
- return Ty.isVector() < Other.Ty.isVector();
- if (Ty.isScalar() != Other.Ty.isScalar())
- return Ty.isScalar() < Other.Ty.isScalar();
- if (Ty.isPointer() != Other.Ty.isPointer())
- return Ty.isPointer() < Other.Ty.isPointer();
-
- if (Ty.isPointer() && Ty.getAddressSpace() != Other.Ty.getAddressSpace())
- return Ty.getAddressSpace() < Other.Ty.getAddressSpace();
-
- if (Ty.isVector() && Ty.getElementCount() != Other.Ty.getElementCount())
- return std::tuple(Ty.isScalable(),
- Ty.getElementCount().getKnownMinValue()) <
- std::tuple(Other.Ty.isScalable(),
- Other.Ty.getElementCount().getKnownMinValue());
-
- assert((!Ty.isVector() || Ty.isScalable() == Other.Ty.isScalable()) &&
- "Unexpected mismatch of scalable property");
- return Ty.isVector()
- ? std::tuple(Ty.isScalable(),
- Ty.getSizeInBits().getKnownMinValue()) <
- std::tuple(Other.Ty.isScalable(),
- Other.Ty.getSizeInBits().getKnownMinValue())
- : Ty.getSizeInBits().getFixedValue() <
- Other.Ty.getSizeInBits().getFixedValue();
+ return Ty.getUniqueRAWLLTData() < Other.Ty.getUniqueRAWLLTData();
}
//===- LLTCodeGen Helpers -------------------------------------------------===//
@@ -430,11 +454,10 @@ std::optional<LLTCodeGen> MVTToLLT(MVT::SimpleValueType SVT) {
MVT VT(SVT);
if (VT.isVector() && !VT.getVectorElementCount().isScalar())
- return LLTCodeGen(
- LLT::vector(VT.getVectorElementCount(), VT.getScalarSizeInBits()));
+ return LLTCodeGen(LLT(VT, true));
if (VT.isInteger() || VT.isFloatingPoint())
- return LLTCodeGen(LLT::scalar(VT.getSizeInBits()));
+ return LLTCodeGen(LLT(VT, true));
return std::nullopt;
}
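To make the effect of the emitter changes concrete, here is the kind of output the updated emitCxxEnumValue()/emitCxxConstructorCall() would produce for a few representative MVTs (a sketch derived from the code above, assuming MVTToLLT builds LLTs with FP info enabled):

    // MVT::i32   -> enum GILLT_i32,   ctor LLT::integer(32)
    // MVT::f64   -> enum GILLT_f64,   ctor LLT::floatingPoint(64, LLT::FPInfo::IEEE_FLOAT)
    // MVT::bf16  -> enum GILLT_bf16,  ctor LLT::bfloat()
    // MVT::v2f16 -> enum GILLT_v2f16, ctor LLT::vector(ElementCount::getFixed(2),
    //                                      LLT::floatingPoint(16, LLT::FPInfo::IEEE_FLOAT))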
>From 79218c4dafcdd6761f6f6ab62d200db1405c8f61 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Thu, 14 Nov 2024 11:32:17 +0000
Subject: [PATCH 03/12] use isScalar and isFixedVector
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 +-
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 44 +++----
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 2 +-
.../Target/AMDGPU/AMDGPUCombinerHelper.cpp | 4 +-
.../AMDGPUGlobalISelDivergenceLowering.cpp | 6 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 42 ++++---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 113 ++++++++----------
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 10 +-
.../AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 7 +-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 4 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 34 +++---
11 files changed, 124 insertions(+), 144 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4e3aaf5da7198c..c258ba3379ec1b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6771,7 +6771,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
LLT TrueTy = MRI.getType(Select->getTrueReg());
// We only do this combine for scalar boolean conditions.
- if (CondTy != LLT::scalar(1))
+ if (!CondTy.isScalar(1))
return false;
if (TrueTy.isPointer())
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d0a62340a5f322..a4239f2567146d 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2099,7 +2099,7 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
const unsigned Offset = (I - 1) * PartSize;
Register SrcReg = MI.getOperand(I).getReg();
- assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
+ assert(MRI.getType(SrcReg).isScalar(PartSize));
auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
@@ -6596,7 +6596,7 @@ LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
// If all finite floats fit into the narrowed integer type, we can just swap
// out the result type. This is practically only useful for conversions from
// half to at least 16-bits, so just handle the one case.
- if (SrcTy.getScalarType() != LLT::scalar(16) ||
+ if (!SrcTy.getScalarType().isScalar(16) ||
NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
return UnableToLegalize;
@@ -7471,7 +7471,7 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
+ assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(32));
// unsigned cul2f(ulong u) {
// uint lz = clz(u);
@@ -7529,7 +7529,7 @@ LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
+ assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(32));
// For i64 < INT_MAX we simply reuse SITOFP.
// Otherwise, divide i64 by 2, round result by ORing with the lowest bit
@@ -7563,7 +7563,7 @@ LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+ assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(64));
// We create double value from 32 bit parts with 32 exponent difference.
// Note that + and - are float operations that adjust the implicit leading
@@ -7595,7 +7595,7 @@ LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- if (SrcTy == LLT::scalar(1)) {
+ if (SrcTy.isScalar(1)) {
auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
MIRBuilder.buildSelect(Dst, Src, True, False);
@@ -7603,17 +7603,17 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
return Legalized;
}
- if (SrcTy != LLT::scalar(64))
+ if (!SrcTy.isScalar(64))
return UnableToLegalize;
- if (DstTy == LLT::scalar(32))
+ if (DstTy.isScalar(32))
// TODO: SelectionDAG has several alternative expansions to port which may
// be more reasonable depending on the available instructions. We also need
// a more advanced mechanism to choose an optimal version depending on
// target features such as sitofp or CTLZ availability.
return lowerU64ToF32WithSITOFP(MI);
- if (DstTy == LLT::scalar(64))
+ if (DstTy.isScalar(64))
return lowerU64ToF64BitFloatOps(MI);
return UnableToLegalize;
@@ -7626,7 +7626,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);
- if (SrcTy == S1) {
+ if (SrcTy.isScalar(1)) {
auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
MIRBuilder.buildSelect(Dst, Src, True, False);
@@ -7634,10 +7634,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
return Legalized;
}
- if (SrcTy != S64)
+ if (!SrcTy.isScalar(64))
return UnableToLegalize;
- if (DstTy == S32) {
+ if (DstTy.isScalar(32)) {
// signed cl2f(long l) {
// long s = l >> 63;
// float r = cul2f((l + s) ^ s);
@@ -7664,12 +7664,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
- if (SrcTy != S64 && SrcTy != S32)
+ if (!SrcTy.isScalar(64) && !SrcTy.isScalar(32))
return UnableToLegalize;
- if (DstTy != S32 && DstTy != S64)
+ if (!DstTy.isScalar(32) && !DstTy.isScalar(64))
return UnableToLegalize;
// FPTOSI gives same result as FPTOUI for positive signed integers.
@@ -7704,11 +7702,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
// FIXME: Only f32 to i64 conversions are supported.
- if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
+ if (!SrcTy.getScalarType().isScalar(32) || !DstTy.getScalarType().isScalar(64))
return UnableToLegalize;
// Expand f32 -> i64 conversion
@@ -7873,8 +7869,8 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
auto [Dst, Src] = MI.getFirst2Regs();
- assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
- MRI.getType(Src).getScalarType() == LLT::scalar(64));
+ assert(MRI.getType(Dst).getScalarType().isScalar(16) &&
+ MRI.getType(Src).getScalarType().isScalar(64));
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
@@ -7985,10 +7981,8 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
auto [DstTy, SrcTy] = MI.getFirst2LLTs();
- const LLT S64 = LLT::scalar(64);
- const LLT S16 = LLT::scalar(16);
- if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
+ if (DstTy.getScalarType().isScalar(16) && SrcTy.getScalarType().isScalar(64))
return lowerFPTRUNC_F64_TO_F16(MI);
return UnableToLegalize;
@@ -9263,7 +9257,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
// The condition was potentially zero extended before, but we want a sign
// extended boolean.
- if (MaskTy != LLT::scalar(1))
+ if (!MaskTy.isScalar(1))
MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
// Continue the sign extension (or truncate) to match the data type.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index bb00442342d843..666615202a4b5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -72,7 +72,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
if (TRI->isSGPRReg(MRI, PhysReg)) {
LLT Ty = MRI.getType(ExtReg);
LLT S32 = LLT::scalar(32);
- if (Ty != S32) {
+ if (!Ty.isScalar(32)) {
// FIXME: We should probably support readfirstlane intrinsics with all
// legal 32-bit types.
assert(Ty.getSizeInBits() == 32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index 46194ab46ff6a7..22cd79c8cc2058 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -409,7 +409,7 @@ static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
const MachineInstr *Def = MRI.getVRegDef(Reg);
if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
Register SrcReg = Def->getOperand(1).getReg();
- return MRI.getType(SrcReg) == LLT::scalar(16);
+ return MRI.getType(SrcReg).isScalar(16);
}
if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
@@ -428,7 +428,7 @@ bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
Register Src2) const {
assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
Register SrcReg = MI.getOperand(1).getReg();
- if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
+ if (!MRI.hasOneNonDBGUse(SrcReg) || !MRI.getType(SrcReg).isScalar(32))
return false;
return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index fb258547e8fb90..d96d1f5ad39f94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -87,7 +87,7 @@ DivergenceLoweringHelper::DivergenceLoweringHelper(
// _(s1) -> SReg_32/64(s1)
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
- assert(MRI->getType(DstReg) == LLT::scalar(1));
+ assert(MRI->getType(DstReg).isScalar(1));
if (MRI->getRegClassOrNull(DstReg)) {
if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
@@ -100,13 +100,11 @@ void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
void DivergenceLoweringHelper::getCandidatesForLowering(
SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
- LLT S1 = LLT::scalar(1);
-
// Add divergent i1 phis to the list
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB.phis()) {
Register Dst = MI.getOperand(0).getReg();
- if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
+ if (MRI->getType(Dst).isScalar(1) && MUI->isDivergent(Dst))
Vreg1Phis.push_back(&MI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 926c1e4b23b4a1..4a3cbc9bc00d09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -105,7 +105,7 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
MachineOperand &Src = MI.getOperand(1);
// TODO: This should be legalized to s32 if needed
- if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
+ if (MRI->getType(Dst.getReg()).isScalar(1))
return false;
const TargetRegisterClass *DstRC
@@ -225,7 +225,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
// - divergent S1 G_PHI should go through lane mask merging algorithm
// and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
// - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
- if (DefTy == LLT::scalar(1))
+ if (DefTy.isScalar(1))
return false;
// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
@@ -651,9 +651,9 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
// Selection logic below is for V2S16 only.
// For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
Register Dst = MI.getOperand(0).getReg();
- if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
+ if (!MRI->getType(Dst).isFixedVector(2, 16) ||
(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
- SrcTy != LLT::scalar(32)))
+ !SrcTy.isScalar(32)))
return selectImpl(MI, *CoverageInfo);
const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
@@ -991,9 +991,9 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
LLT Ty = MRI->getType(Dst0);
unsigned Opc;
- if (Ty == LLT::scalar(32))
+ if (Ty.isScalar(32))
Opc = AMDGPU::V_DIV_SCALE_F32_e64;
- else if (Ty == LLT::scalar(64))
+ else if (Ty.isScalar(64))
Opc = AMDGPU::V_DIV_SCALE_F64_e64;
else
return false;
@@ -2305,11 +2305,10 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- const LLT S1 = LLT::scalar(1);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
const RegisterBank *DstRB;
- if (DstTy == S1) {
+ if (DstTy.isScalar(1)) {
// This is a special case. We don't treat s1 for legalization artifacts as
// vcc booleans.
DstRB = SrcRB;
@@ -2347,7 +2346,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return true;
}
- if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
+ if (DstTy.isFixedVector(2, 16) && SrcTy.isFixedVector(2, 32)) {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -2639,8 +2638,7 @@ static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
return false;
- assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
- LLT::fixed_vector(2, 16));
+ assert(MRI.getType(Shuffle->getOperand(0).getReg()).isFixedVector(2, 16));
ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
assert(Mask.size() == 2);
@@ -2664,8 +2662,8 @@ bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
Register Src = I.getOperand(1).getReg();
- if (MRI->getType(Dst) == LLT::scalar(32) &&
- MRI->getType(Src) == LLT::scalar(16)) {
+ if (MRI->getType(Dst).isScalar(32) &&
+ MRI->getType(Src).isScalar(16)) {
if (isExtractHiElt(*MRI, Src, Src)) {
MachineBasicBlock *BB = I.getParent();
BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
@@ -2693,7 +2691,7 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
- MRI->getType(Dst) != LLT::scalar(64))
+ !MRI->getType(Dst).isScalar(64))
return false;
Register Src = MI.getOperand(1).getReg();
@@ -2739,7 +2737,7 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
- MRI->getType(Dst) != LLT::scalar(64))
+ !MRI->getType(Dst).isScalar(64))
return false;
Register Src = MI.getOperand(1).getReg();
@@ -2911,7 +2909,7 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
// RegBankSelect knows what it's doing if the branch condition is scc, even
// though it currently does not.
if (!isVCC(CondReg, *MRI)) {
- if (MRI->getType(CondReg) != LLT::scalar(32))
+ if (!MRI->getType(CondReg).isScalar(32))
return false;
CondPhysReg = AMDGPU::SCC;
@@ -3374,7 +3372,7 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
Register ZExtSrc;
if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+ return MRI.getType(ZExtSrc).isScalar(32) ? ZExtSrc : Register();
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
@@ -3382,7 +3380,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
return Register();
assert(Def->getNumOperands() == 3 &&
- MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ MRI.getType(Def->getOperand(0).getReg()).isScalar(64));
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
@@ -3972,7 +3970,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
// This is a workaround. For extension from type i1, `selectImpl()` uses
// patterns from TD file and generates an illegal VGPR to SGPR COPY as type
// i1 can only be hold in a SGPR class.
- if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
+ if (!MRI->getType(I.getOperand(1).getReg()).isScalar(1) &&
selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
@@ -4199,7 +4197,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
if (MI->getOpcode() == AMDGPU::G_FNEG &&
// It's possible to see an f32 fneg here, but unlikely.
// TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+ MRI.getType(Src).isFixedVector(2, 16)) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = MI->getOperand(1).getReg();
MI = MRI.getVRegDef(Src);
@@ -5697,7 +5695,7 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
if (!EncodedOffset)
return std::nullopt;
- assert(MRI->getType(SOffset) == LLT::scalar(32));
+ assert(MRI->getType(SOffset).isScalar(32));
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
@@ -5712,7 +5710,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
- assert(MRI->getType(Src) == LLT::scalar(16));
+ assert(MRI->getType(Src).isScalar(16));
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9836e10c36bc5d..439cc78ed705e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -275,7 +275,7 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
if (!QueryTy.isVector())
return false;
const LLT EltTy = QueryTy.getElementType();
- return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
+ return EltTy.isScalar(16) || EltTy.getSizeInBits() >= 32;
};
}
@@ -2470,7 +2470,7 @@ bool AMDGPULegalizerInfo::legalizeFceil(
const LLT S64 = LLT::scalar(64);
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src) == S64);
+ assert(MRI.getType(Src).isScalar(64));
// result = trunc(src)
// if (src > 0.0 && src != result)
@@ -2533,7 +2533,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
const LLT S64 = LLT::scalar(64);
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src) == S64);
+ assert(MRI.getType(Src).isScalar(64));
// TODO: Should this use extract since the low half is unused?
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
@@ -2580,12 +2580,12 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Src) == S64);
+ assert(MRI.getType(Src).isScalar(64));
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
auto ThirtyTwo = B.buildConstant(S32, 32);
- if (MRI.getType(Dst) == S64) {
+ if (MRI.getType(Dst).isScalar(64)) {
auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
: B.buildUITOFP(S64, Unmerge.getReg(1));
@@ -2598,7 +2598,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
return true;
}
- assert(MRI.getType(Dst) == S32);
+ assert(MRI.getType(Dst).isScalar(32));
auto One = B.buildConstant(S32, 1);
@@ -2639,7 +2639,7 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
const LLT S32 = LLT::scalar(32);
const LLT SrcLT = MRI.getType(Src);
- assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
+ assert((SrcLT.isScalar(32) || SrcLT.isScalar(64)) && MRI.getType(Dst).isScalar(64));
unsigned Flags = MI.getFlags();
@@ -2654,7 +2654,7 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
//
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
MachineInstrBuilder Sign;
- if (Signed && SrcLT == S32) {
+ if (Signed && SrcLT.isScalar(32)) {
// However, a 32-bit floating point number has only 23 bits mantissa and
// it's not enough to hold all the significant bits of `lof` if val is
// negative. To avoid the loss of precision, We need to take the absolute
@@ -2664,7 +2664,7 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
Trunc = B.buildFAbs(S32, Trunc, Flags);
}
MachineInstrBuilder K0, K1;
- if (SrcLT == S64) {
+ if (SrcLT.isScalar(64)) {
K0 = B.buildFConstant(
S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
K1 = B.buildFConstant(
@@ -2680,11 +2680,11 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
- auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
+ auto Hi = (Signed && SrcLT.isScalar(64)) ? B.buildFPTOSI(S32, FloorMul)
: B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);
- if (Signed && SrcLT == S32) {
+ if (Signed && SrcLT.isScalar(32)) {
// Flip the result based on the signedness, which is either all 0s or 1s.
Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
// r := xor({lo, hi}, sign) - sign;
@@ -3257,7 +3257,7 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
break;
}
case TargetOpcode::G_FPEXT: {
- return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
+ return MRI.getType(DefMI->getOperand(1).getReg()).isScalar(16);
}
default:
return false;
@@ -3314,7 +3314,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();
- if (Ty == LLT::scalar(16)) {
+ if (Ty.isScalar(16)) {
const LLT F32 = LLT::scalar(32);
// Nothing in half is a denormal when promoted to f32.
auto Ext = B.buildFPExt(F32, Src, Flags);
@@ -3326,7 +3326,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
return true;
}
- assert(Ty == LLT::scalar(32));
+ assert(Ty.isScalar(32));
auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
if (!ScaledInput) {
@@ -3473,7 +3473,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
LLT Ty = B.getMRI()->getType(Dst);
- if (Ty == LLT::scalar(32)) {
+ if (Ty.isScalar(32)) {
auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
if (ScaledInput) {
auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
@@ -3496,7 +3496,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
}
}
- auto Log2Operand = Ty == LLT::scalar(16)
+ auto Log2Operand = Ty.isScalar(16)
? B.buildFLog2(Ty, Src, Flags)
: B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
.addUse(Src)
@@ -3856,13 +3856,13 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
Register Dst = MI.getOperand(0).getReg();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
- assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
+ assert(MRI.getType(Dst).isFixedVector(2, 16));
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
- assert(MRI.getType(Src0) == S32);
+ assert(MRI.getType(Src0).isScalar(32));
Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
}
@@ -4453,7 +4453,7 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
Align Alignment) const {
Register DstReg = MI.getOperand(0).getReg();
- assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
+ assert(B.getMRI()->getType(DstReg).isScalar(32) &&
"unexpected kernarg parameter type");
Register Ptr = getKernargParameterPtr(B, Offset);
@@ -4470,15 +4470,12 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
- LLT S16 = LLT::scalar(16);
- LLT S32 = LLT::scalar(32);
- LLT S64 = LLT::scalar(64);
- if (DstTy == S16)
+ if (DstTy.isScalar(16))
return legalizeFDIV16(MI, MRI, B);
- if (DstTy == S32)
+ if (DstTy.isScalar(32))
return legalizeFDIV32(MI, MRI, B);
- if (DstTy == S64)
+ if (DstTy.isScalar(64))
return legalizeFDIV64(MI, MRI, B);
return false;
@@ -4706,16 +4703,14 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
}
}
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty == S32)
+ if (Ty.isScalar(32))
legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
- else if (Ty == S64)
+ else if (Ty.isScalar(64))
legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
else
return false;
@@ -4727,11 +4722,10 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty != S32 && Ty != S64)
+ if (!Ty.isScalar(32) && !Ty.isScalar(64))
return false;
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
@@ -4771,7 +4765,7 @@ bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
}
}
- if (Ty == S32)
+ if (Ty.isScalar(32))
legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
else
legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
@@ -4806,7 +4800,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
MF.getTarget().Options.UnsafeFPMath;
if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
- if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
+ if (!AllowInaccurateRcp && !ResTy.isScalar(16))
return false;
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
@@ -4840,7 +4834,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
// For f16 require afn or arcp.
// For f32 require afn.
- if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
+ if (!AllowInaccurateRcp && (!ResTy.isScalar(16) ||
!MI.getFlag(MachineInstr::FmArcp)))
return false;
@@ -5154,7 +5148,7 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT Ty = MRI.getType(Res0);
- LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
+ LLT InstrExpTy = Ty.isScalar(16) ? LLT::scalar(16) : LLT::scalar(32);
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
.addUse(Val)
@@ -5401,11 +5395,11 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty == LLT::scalar(32))
+ if (Ty.isScalar(32))
return legalizeFSQRTF32(MI, MRI, B);
- if (Ty == LLT::scalar(64))
+ if (Ty.isScalar(64))
return legalizeFSQRTF64(MI, MRI, B);
- if (Ty == LLT::scalar(16))
+ if (Ty.isScalar(16))
return legalizeFSQRTF16(MI, MRI, B);
return false;
}
@@ -5429,9 +5423,9 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
LLT Ty = MRI.getType(Dst);
const fltSemantics *FltSemantics;
- if (Ty == LLT::scalar(32))
+ if (Ty.isScalar(32))
FltSemantics = &APFloat::IEEEsingle();
- else if (Ty == LLT::scalar(64))
+ else if (Ty.isScalar(64))
FltSemantics = &APFloat::IEEEdouble();
else
return false;
@@ -5777,7 +5771,7 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
LLT StoreVT = MRI.getType(Reg);
- assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
+ assert(StoreVT.isVector() && StoreVT.getElementType().isScalar(16));
if (ST.hasUnpackedD16VMem()) {
auto Unmerge = B.buildUnmerge(S16, Reg);
@@ -5826,7 +5820,7 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
llvm_unreachable("invalid data type");
}
- if (StoreVT == LLT::fixed_vector(3, S16)) {
+ if (StoreVT.isFixedVector(3, 16)) {
Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
.getReg(0);
}
@@ -5839,8 +5833,6 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
MachineRegisterInfo *MRI = B.getMRI();
LLT Ty = MRI->getType(VData);
- const LLT S16 = LLT::scalar(16);
-
// Fixup buffer resources themselves needing to be v4i128.
if (hasBufferRsrcWorkaround(Ty))
return castBufferRsrcToV4I32(VData, B);
@@ -5850,13 +5842,13 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
VData = B.buildBitcast(Ty, VData).getReg(0);
}
// Fixup illegal register types for i8 stores.
- if (Ty == LLT::scalar(8) || Ty == S16) {
+ if (Ty.isScalar(8) || Ty.isScalar(16)) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
return AnyExt;
}
if (Ty.isVector()) {
- if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
+ if (Ty.getElementType().isScalar(16) && Ty.getNumElements() <= 4) {
if (IsFormat)
return handleD16VData(B, *MRI, VData);
}
@@ -6315,7 +6307,7 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
(I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
(I >= Intr->CoordStart && !IsA16)) {
if ((I < Intr->GradientStart) && IsA16 &&
- (B.getMRI()->getType(AddrReg) == S16)) {
+ (B.getMRI()->getType(AddrReg).isScalar(16))) {
assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
@@ -6365,7 +6357,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
if (SrcOp.isReg()) {
AddrRegs.push_back(SrcOp.getReg());
- assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
+ assert(B.getMRI()->getType(SrcOp.getReg()).isScalar(32));
}
}
@@ -6435,9 +6427,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT AddrTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 =
- ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
- const bool IsA16 = AddrTy == S16;
- const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
+ ST.hasG16() ? (BaseOpcode->Gradients && GradTy.isScalar(16)) : GradTy.isScalar(16);
+ const bool IsA16 = AddrTy.isScalar(16);
+ const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType().isScalar(16);
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -6684,14 +6676,14 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (IsTFE) {
Dst1Reg = MI.getOperand(1).getReg();
- if (MRI->getType(Dst1Reg) != S32)
+ if (!MRI->getType(Dst1Reg).isScalar(32))
return false;
// TODO: Make sure the TFE operand bit is set.
MI.removeOperand(1);
// Handle the easy case that requires no repack instructions.
- if (Ty == S32) {
+ if (Ty.isScalar(32)) {
B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
return true;
}
@@ -6726,7 +6718,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
// Avoid a build/concat_vector of 1 entry.
- if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
+ if (Ty.isFixedVector(2, 16) && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
B.buildBitcast(DstReg, ResultRegs[0]);
return true;
}
@@ -6739,7 +6731,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
//
// TODO: We don't really need to use load s32 elements. We would only need one
// cast for the TFE result if a multiple of v2s16 was used.
- if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
+ if (!RegTy.isFixedVector(2, 16) && !ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
Reg = B.buildBitcast(V2S16, Reg).getReg(0);
} else if (ST.hasUnpackedD16VMem()) {
@@ -6764,12 +6756,11 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return true;
}
- assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
+ assert(!ST.hasUnpackedD16VMem() && ResTy.isFixedVector(2, 16));
const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
// Deal with the one annoying legal case.
- const LLT V3S16 = LLT::fixed_vector(3, 16);
- if (Ty == V3S16) {
+ if (Ty.isFixedVector(3, 16)) {
if (IsTFE) {
if (ResultRegs.size() == 1) {
NewResultReg = ResultRegs[0];
@@ -7228,7 +7219,7 @@ bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Src = MI.getOperand(0).getReg();
- if (MRI.getType(Src) != S64)
+ if (!MRI.getType(Src).isScalar(64))
return false;
auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
@@ -7536,7 +7527,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
Register Index = MI.getOperand(5).getReg();
LLT S32 = LLT::scalar(32);
- if (MRI.getType(Index) != S32)
+ if (!MRI.getType(Index).isScalar(32))
MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
@@ -7545,7 +7536,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
LLT S32 = LLT::scalar(32);
- if (MRI.getType(Index) != S32)
+ if (!MRI.getType(Index).isScalar(32))
MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 888817e52e35d4..107d0f8c495032 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -213,7 +213,7 @@ bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
// types are legalized. v4i8 -> v4f32 is probably the only case to worry
// about in practice.
LLT Ty = MRI.getType(DstReg);
- if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
+ if (Ty.isScalar(32) || Ty.isScalar(16)) {
Register SrcReg = MI.getOperand(1).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
@@ -232,10 +232,10 @@ void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
- if (SrcTy != S32)
+ if (!SrcTy.isScalar(32))
SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
- if (Ty == S32) {
+ if (Ty.isScalar(32)) {
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
MI.getFlags());
} else {
@@ -349,7 +349,7 @@ void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
const LLT S32 = LLT::scalar(32);
Register CvtSrc = MatchInfo.CvtVal;
LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
- if (SrcTy != S32) {
+ if (!SrcTy.isScalar(32)) {
assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
}
@@ -418,7 +418,7 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
MachineInstr &MI, unsigned &NewOpcode) const {
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
- if (MRI.getType(Src0) != LLT::scalar(64))
+ if (!MRI.getType(Src0).isScalar(64))
return false;
if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 52c6e5274ae5b7..cf742511f916ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -119,11 +119,11 @@ bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
// Try to find a pattern where an i64 value should get clamped to short.
const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
- if (SrcType != LLT::scalar(64))
+ if (!SrcType.isScalar(64))
return false;
const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- if (DstType != LLT::scalar(16))
+ if (!DstType.isScalar(16))
return false;
Register Base;
@@ -177,8 +177,7 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
Register Src = MatchInfo.Origin;
- assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
- LLT::scalar(64));
+ assert(MI.getParent()->getParent()->getRegInfo().getType(Src).isScalar(64));
const LLT S32 = LLT::scalar(32);
auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 98c48f4fe3705b..68312ef657af3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -195,7 +195,7 @@ bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
// med3 for i16 is only available on gfx9+, and not available for v2i16.
LLT Ty = MRI.getType(Dst);
- if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
+ if ((!Ty.isScalar(16) || !STI.hasMed3_16()) && !Ty.isScalar(32))
return false;
MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
@@ -238,7 +238,7 @@ bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
LLT Ty = MRI.getType(Dst);
// med3 for f16 is only available on gfx9+, and not available for v2f16.
- if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
+ if ((!Ty.isScalar(16) || !STI.hasMed3_16()) && !Ty.isScalar(32))
return false;
auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 224c368cff4a1f..9ffb23501064fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -131,8 +131,8 @@ class ApplyRegBankMapping final : public GISelChangeObserver {
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
if (SrcBank == &AMDGPU::VCCRegBank) {
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(SrcReg) == LLT::scalar(1));
- assert(MRI.getType(DstReg) == S32);
+ assert(MRI.getType(SrcReg).isScalar(1));
+ assert(MRI.getType(DstReg).isScalar(32));
assert(NewBank == &AMDGPU::VGPRRegBank);
// Replace the extension with a select, which really uses the boolean
@@ -170,7 +170,7 @@ class ApplyRegBankMapping final : public GISelChangeObserver {
continue;
const RegisterBank *RB = NewBank;
- if (MRI.getType(Reg) == LLT::scalar(1)) {
+ if (MRI.getType(Reg).isScalar(1)) {
assert(NewBank == &AMDGPU::VGPRRegBank &&
"s1 operands should only be used for vector bools");
assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
@@ -298,7 +298,7 @@ AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
if (!Ty.isValid())
return AMDGPU::SGPRRegBank;
- return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
+ return Ty.isScalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
}
return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
@@ -1495,7 +1495,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank) {
- if (Ty == S32)
+ if (Ty.isScalar(32))
return true;
// There is no 64-bit vgpr bitfield extract instructions so the operation
@@ -1568,7 +1568,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
// TODO: It might be worth using a pseudo here to avoid scc clobber and
// register class constraints.
- unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
+ unsigned Opc = Ty.isScalar(32) ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
(Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
@@ -1790,7 +1790,7 @@ Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
const LLT S16 = LLT::scalar(16);
LLT StoreVT = MRI.getType(Reg);
- if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
+ if (!StoreVT.isVector() || !StoreVT.getElementType().isScalar(16))
return Reg;
auto Unmerge = B.buildUnmerge(S16, Reg);
@@ -2213,7 +2213,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_IMPLICIT_DEF: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::scalar(1))
+ if (!DstTy.isScalar(1))
break;
const RegisterBank *DstBank =
@@ -2243,7 +2243,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_PHI: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::scalar(1))
+ if (!DstTy.isScalar(1))
break;
const LLT S32 = LLT::scalar(32);
@@ -2514,7 +2514,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
// Packed 16-bit operations need to be scalarized and promoted.
- if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
+ if (!DstTy.isScalar(16) && !DstTy.isFixedVector(2, 16))
break;
const RegisterBank *DstBank =
@@ -2588,7 +2588,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register SrcReg1 = MI.getOperand(2).getReg();
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
+ assert(MRI.getType(DstReg).isScalar(64) && "This is a special case for s_mul_u64 "
"that handles only 64-bit operands.");
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
@@ -2684,7 +2684,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register SrcReg = MI.getOperand(1).getReg();
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(SrcReg);
- if (Ty == S32)
+ if (Ty.isScalar(32))
break;
ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
@@ -2708,7 +2708,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register SrcReg = MI.getOperand(1).getReg();
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(SrcReg);
- if (Ty == S32)
+ if (Ty.isScalar(32))
break;
// We can narrow this more efficiently than Helper can by using ffbh/ffbl
@@ -2776,7 +2776,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
- if (SrcTy != LLT::scalar(1))
+ if (!SrcTy.isScalar(1))
return;
// It is not legal to have a legalization artifact with a VCC source. Rather
@@ -3788,9 +3788,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// we need to take the virtual register's type as a hint on how to interpret
// s1 values.
if (!SrcReg.isVirtual() && !DstBank &&
- MRI.getType(DstReg) == LLT::scalar(1))
+ MRI.getType(DstReg).isScalar(1))
DstBank = &AMDGPU::VCCRegBank;
- else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))
+ else if (!DstReg.isVirtual() && MRI.getType(SrcReg).isScalar(1))
DstBank = &AMDGPU::VCCRegBank;
if (!DstBank)
@@ -4154,7 +4154,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
- if (DstTy == LLT::fixed_vector(2, 16)) {
+ if (DstTy.isFixedVector(2, 16)) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
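Note on the predicates used throughout the hunks above: isScalar(N) and isFixedVector(N, W) are size-based checks whose definitions are not shown in this excerpt. The assumed semantics, illustrated as a minimal sketch (names and behavior inferred, not quoted from the patch):

    #include "llvm/CodeGenTypes/LowLevelType.h"
    #include <cassert>
    using namespace llvm;

    // Illustrative check of the assumed predicate semantics: kind plus bit
    // width for scalars, element count plus element width for fixed vectors.
    static void checkSizePredicates() {
      LLT S64 = LLT::scalar(64);
      LLT V2S16 = LLT::fixed_vector(2, 16);
      assert(S64.isScalar(64) && !S64.isScalar(32));
      assert(V2S16.isFixedVector(2, 16));
    }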
>From f8fe50e2e18b40d422d3e9f6d317902054eebe63 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Thu, 5 Dec 2024 14:29:20 +0000
Subject: [PATCH 04/12] fix MIRBuilder
---
llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index be347006a81f92..adfec6f35d4757 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -563,7 +563,7 @@ MachineInstrBuilder MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc,
Op.getLLTTy(*getMRI()).getSizeInBits())
Opcode = TargetOpcode::G_TRUNC;
else
- assert(Res.getLLTTy(*getMRI()) == Op.getLLTTy(*getMRI()));
+ assert(Res.getLLTTy(*getMRI()).getSizeInBits() == Op.getLLTTy(*getMRI()).getSizeInBits());
return buildInstr(Opcode, Res, Op);
}
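The size-only comparison matters once integer and floating-point LLTs are distinct: a result and operand can have the same width yet compare unequal, which would trip the old exact-equality assert. A small illustration under that assumption:

    #include "llvm/CodeGenTypes/LowLevelType.h"
    #include <cassert>
    using namespace llvm;

    // With FPInfo enabled, f32 and s32 are different LLTs of the same width.
    static void sameWidthDifferentKind() {
      LLT F32 = LLT::float32();
      LLT S32 = LLT::integer(32);
      assert(F32 != S32);
      assert(F32.getSizeInBits() == S32.getSizeInBits());
    }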
>From 69b5f8849ea651e620ddce3ea38da6fdb31a92d8 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Thu, 5 Dec 2024 13:50:38 +0000
Subject: [PATCH 05/12] FPInfo: IRTranslator and CallLowering
---
llvm/include/llvm/CodeGen/Analysis.h | 2 +-
.../llvm/CodeGen/GlobalISel/IRTranslator.h | 4 +-
llvm/lib/CodeGen/Analysis.cpp | 8 +-
llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 23 +++--
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 95 ++++++++++---------
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 37 ++++++--
6 files changed, 100 insertions(+), 69 deletions(-)
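The common thread in this patch is an EnableFPInfo flag handed from the IRTranslator and call lowering into type computation, so that IR floating-point types become float LLTs instead of plain scalars. A rough sketch of the intended effect, assuming the LowLevelType.h semantics introduced earlier in the series:

    #include "llvm/CodeGen/LowLevelTypeUtils.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // Sketch: how the flag is expected to change getLLTForType's result.
    static void fpInfoSketch(LLVMContext &Ctx, const DataLayout &DL) {
      Type *FloatTy = Type::getFloatTy(Ctx);
      LLT Legacy = getLLTForType(*FloatTy, DL, /*EnableFPInfo=*/false); // s32
      LLT WithFP = getLLTForType(*FloatTy, DL, /*EnableFPInfo=*/true);  // f32
      (void)Legacy;
      (void)WithFP;
    }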
diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h
index 362cc30bbd06a1..837a41437d517d 100644
--- a/llvm/include/llvm/CodeGen/Analysis.h
+++ b/llvm/include/llvm/CodeGen/Analysis.h
@@ -95,7 +95,7 @@ inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
/// with the in-memory offsets of each of the individual values.
///
void computeValueLLTs(const DataLayout &DL, Type &Ty,
- SmallVectorImpl<LLT> &ValueTys,
+ SmallVectorImpl<LLT> &ValueTys, bool EnableFPInfo,
SmallVectorImpl<uint64_t> *Offsets = nullptr,
uint64_t StartingOffset = 0);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 6fd05c8fddd5f8..983758e3065604 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -619,6 +619,8 @@ class IRTranslator : public MachineFunctionPass {
CodeGenOptLevel OptLevel;
+ bool EnableFPInfo;
+
/// Current optimization remark emitter. Used to report failures.
std::unique_ptr<OptimizationRemarkEmitter> ORE;
@@ -772,7 +774,7 @@ class IRTranslator : public MachineFunctionPass {
BranchProbability Prob = BranchProbability::getUnknown());
public:
- IRTranslator(CodeGenOptLevel OptLevel = CodeGenOptLevel::None);
+ IRTranslator(CodeGenOptLevel OptLevel = CodeGenOptLevel::None, bool EnableFPInfo = false);
StringRef getPassName() const override { return "IRTranslator"; }
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index e7b9417de8c9f7..7a433354cdfaac 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -139,7 +139,7 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
}
void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
- SmallVectorImpl<LLT> &ValueTys,
+ SmallVectorImpl<LLT> &ValueTys, bool EnableFPInfo,
SmallVectorImpl<uint64_t> *Offsets,
uint64_t StartingOffset) {
// Given a struct type, recursively traverse the elements.
@@ -150,7 +150,7 @@ void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr;
for (unsigned I = 0, E = STy->getNumElements(); I != E; ++I) {
uint64_t EltOffset = SL ? SL->getElementOffset(I) : 0;
- computeValueLLTs(DL, *STy->getElementType(I), ValueTys, Offsets,
+ computeValueLLTs(DL, *STy->getElementType(I), ValueTys, EnableFPInfo, Offsets,
StartingOffset + EltOffset);
}
return;
@@ -160,7 +160,7 @@ void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
Type *EltTy = ATy->getElementType();
uint64_t EltSize = DL.getTypeAllocSize(EltTy).getFixedValue();
for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
- computeValueLLTs(DL, *EltTy, ValueTys, Offsets,
+ computeValueLLTs(DL, *EltTy, ValueTys, EnableFPInfo, Offsets,
StartingOffset + i * EltSize);
return;
}
@@ -168,7 +168,7 @@ void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
if (Ty.isVoidTy())
return;
// Base case: we can get an LLT for this LLVM IR type.
- ValueTys.push_back(getLLTForType(Ty, DL));
+ ValueTys.push_back(getLLTForType(Ty, DL, EnableFPInfo));
if (Offsets != nullptr)
Offsets->push_back(StartingOffset * 8);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index d17b20d977ce99..32702ee465fb49 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -158,7 +158,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
if (const Function *F = dyn_cast<Function>(CalleeV)) {
if (F->hasFnAttribute(Attribute::NonLazyBind)) {
- LLT Ty = getLLTForType(*F->getType(), DL);
+ LLT Ty = getLLTForType(*F->getType(), DL, /* EnableFPInfo */ true);
Register Reg = MIRBuilder.buildGlobalValue(Ty, F).getReg(0);
Info.Callee = MachineOperand::CreateReg(Reg, false);
} else {
@@ -780,11 +780,11 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
const MVT ValVT = VA.getValVT();
const MVT LocVT = VA.getLocVT();
- const LLT LocTy(LocVT);
- const LLT ValTy(ValVT);
+ const LLT LocTy(LocVT, /* EnableFPInfo */ true);
+ const LLT ValTy(ValVT, /* EnableFPInfo */ true);
const LLT NewLLT = Handler.isIncomingArgumentHandler() ? LocTy : ValTy;
const EVT OrigVT = EVT::getEVT(Args[i].Ty);
- const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
+ const LLT OrigTy = getLLTForType(*Args[i].Ty, DL, /* EnableFPInfo */ true);
const LLT PointerTy = LLT::pointer(
AllocaAddressSpace, DL.getPointerSizeInBits(AllocaAddressSpace));
@@ -822,8 +822,11 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
if (!Handler.isIncomingArgumentHandler() && OrigTy != ValTy &&
VA.getLocInfo() != CCValAssign::Indirect) {
assert(Args[i].OrigRegs.size() == 1);
+ unsigned ExtendOp = extendOpFromFlags(Args[i].Flags[0]);
+ if (OrigTy.isFloat() && ValTy.isFloat())
+ ExtendOp = TargetOpcode::G_FPEXT;
buildCopyToRegs(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy,
- ValTy, extendOpFromFlags(Args[i].Flags[0]));
+ ValTy, ExtendOp);
}
bool IndirectParameterPassingHandled = false;
@@ -1003,7 +1006,7 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
Align BaseAlign = DL.getPrefTypeAlign(RetTy);
Type *RetPtrTy =
PointerType::get(RetTy->getContext(), DL.getAllocaAddrSpace());
- LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetPtrTy), DL);
+ LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetPtrTy), DL, /* EnableFPInfo */ true);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
@@ -1033,7 +1036,7 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
unsigned NumValues = SplitVTs.size();
Align BaseAlign = DL.getPrefTypeAlign(RetTy);
unsigned AS = DL.getAllocaAddrSpace();
- LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetTy->getContext(), AS), DL);
+ LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetTy->getContext(), AS), DL, /* EnableFPInfo */ true);
MachinePointerInfo PtrInfo(AS);
@@ -1291,8 +1294,8 @@ void CallLowering::ValueHandler::copyArgumentMemory(
Register CallLowering::ValueHandler::extendRegister(Register ValReg,
const CCValAssign &VA,
unsigned MaxSizeBits) {
- LLT LocTy{VA.getLocVT()};
- LLT ValTy{VA.getValVT()};
+ LLT LocTy(VA.getLocVT(), /* EnableFPInfo */ true);
+ LLT ValTy(VA.getValVT(), /* EnableFPInfo */ true);
if (LocTy.getSizeInBits() == ValTy.getSizeInBits())
return ValReg;
@@ -1383,7 +1386,7 @@ static bool isCopyCompatibleType(LLT SrcTy, LLT DstTy) {
void CallLowering::IncomingValueHandler::assignValueToReg(
Register ValVReg, Register PhysReg, const CCValAssign &VA) {
const MVT LocVT = VA.getLocVT();
- const LLT LocTy(LocVT);
+ const LLT LocTy(LocVT, true);
const LLT RegTy = MRI.getType(ValVReg);
if (isCopyCompatibleType(RegTy, LocTy)) {
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index f668e41094bbc8..28041033e6df57 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -129,8 +129,8 @@ static void reportTranslationError(MachineFunction &MF,
ORE.emit(R);
}
-IRTranslator::IRTranslator(CodeGenOptLevel optlevel)
- : MachineFunctionPass(ID), OptLevel(optlevel) {}
+IRTranslator::IRTranslator(CodeGenOptLevel optlevel, bool EnableFPInfo)
+ : MachineFunctionPass(ID), OptLevel(optlevel), EnableFPInfo(EnableFPInfo) {}
#ifndef NDEBUG
namespace {
@@ -194,7 +194,7 @@ IRTranslator::allocateVRegs(const Value &Val) {
auto *Regs = VMap.getVRegs(Val);
auto *Offsets = VMap.getOffsets(Val);
SmallVector<LLT, 4> SplitTys;
- computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ computeValueLLTs(*DL, *Val.getType(), SplitTys, EnableFPInfo,
Offsets->empty() ? Offsets : nullptr);
for (unsigned i = 0; i < SplitTys.size(); ++i)
Regs->push_back(0);
@@ -218,7 +218,7 @@ ArrayRef<Register> IRTranslator::getOrCreateVRegs(const Value &Val) {
"Don't know how to create an empty vreg");
SmallVector<LLT, 4> SplitTys;
- computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ computeValueLLTs(*DL, *Val.getType(), SplitTys, EnableFPInfo,
Offsets->empty() ? Offsets : nullptr);
if (!isa<Constant>(Val)) {
@@ -840,7 +840,7 @@ void IRTranslator::emitJumpTable(SwitchCG::JumpTable &JT,
MIB.setDebugLoc(CurBuilder->getDebugLoc());
Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
- const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
auto Table = MIB.buildJumpTable(PtrTy, JT.JTI);
MIB.buildBrJT(Table.getReg(0), JT.JTI, JT.Reg);
@@ -855,7 +855,7 @@ bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
const Value &SValue = *JTH.SValue;
// Subtract the lowest switch case value from the value being switched on.
- const LLT SwitchTy = getLLTForType(*SValue.getType(), *DL);
+ const LLT SwitchTy = getLLTForType(*SValue.getType(), *DL, EnableFPInfo);
Register SwitchOpReg = getOrCreateVReg(SValue);
auto FirstCst = MIB.buildConstant(SwitchTy, JTH.First);
auto Sub = MIB.buildSub({SwitchTy}, SwitchOpReg, FirstCst);
@@ -863,7 +863,7 @@ bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
// This value may be smaller or larger than the target's pointer type, and
// therefore require extension or truncating.
auto *PtrIRTy = PointerType::getUnqual(SValue.getContext());
- const LLT PtrScalarTy = LLT::scalar(DL->getTypeSizeInBits(PtrIRTy));
+ const LLT PtrScalarTy = LLT::integer(DL->getTypeSizeInBits(PtrIRTy));
Sub = MIB.buildZExtOrTrunc(PtrScalarTy, Sub);
JT.Reg = Sub.getReg(0);
@@ -880,7 +880,8 @@ bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
auto Cst = getOrCreateVReg(
*ConstantInt::get(SValue.getType(), JTH.Last - JTH.First));
Cst = MIB.buildZExtOrTrunc(PtrScalarTy, Cst).getReg(0);
- auto Cmp = MIB.buildICmp(CmpInst::ICMP_UGT, LLT::scalar(1), Sub, Cst);
+ LLT CmpTy = LLT::integer(1);
+ auto Cmp = MIB.buildICmp(CmpInst::ICMP_UGT, CmpTy, Sub, Cst);
auto BrCond = MIB.buildBrCond(Cmp.getReg(0), *JT.Default);
@@ -911,7 +912,7 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
return;
}
- const LLT i1Ty = LLT::scalar(1);
+ const LLT i1Ty = LLT::integer(1);
// Build the compare.
if (!CB.CmpMHS) {
const auto *CI = dyn_cast<ConstantInt>(CB.CmpRHS);
@@ -1088,19 +1089,19 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
auto RangeSub = MIB.buildSub(SwitchOpTy, SwitchOpReg, MinValReg);
Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
- const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
LLT MaskTy = SwitchOpTy;
if (MaskTy.getSizeInBits() > PtrTy.getSizeInBits() ||
!llvm::has_single_bit<uint32_t>(MaskTy.getSizeInBits()))
- MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ MaskTy = LLT::integer(PtrTy.getSizeInBits());
else {
// Ensure that the type will fit the mask value.
for (unsigned I = 0, E = B.Cases.size(); I != E; ++I) {
if (!isUIntN(SwitchOpTy.getSizeInBits(), B.Cases[I].Mask)) {
// Switch table case range are encoded into series of masks.
// Just use pointer type, it's guaranteed to fit.
- MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ MaskTy = LLT::integer(PtrTy.getSizeInBits());
break;
}
}
@@ -1109,7 +1110,7 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
if (SwitchOpTy != MaskTy)
SubReg = MIB.buildZExtOrTrunc(MaskTy, SubReg).getReg(0);
- B.RegVT = getMVTForLLT(MaskTy);
+ B.RegVT = getMVTForLLT(MaskTy, EnableFPInfo);
B.Reg = SubReg;
MachineBasicBlock *MBB = B.Cases[0].ThisBB;
@@ -1123,7 +1124,8 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
if (!B.FallthroughUnreachable) {
// Conditional branch to the default block.
auto RangeCst = MIB.buildConstant(SwitchOpTy, B.Range);
- auto RangeCmp = MIB.buildICmp(CmpInst::Predicate::ICMP_UGT, LLT::scalar(1),
+ LLT CmpTy = LLT::integer(1);
+ auto RangeCmp = MIB.buildICmp(CmpInst::Predicate::ICMP_UGT, CmpTy,
RangeSub, RangeCst);
MIB.buildBrCond(RangeCmp, *B.Default);
}
@@ -1141,7 +1143,8 @@ void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB,
MachineIRBuilder &MIB = *CurBuilder;
MIB.setMBB(*SwitchBB);
- LLT SwitchTy = getLLTForMVT(BB.RegVT);
+ LLT SwitchTy = getLLTForMVT(BB.RegVT, EnableFPInfo);
+ LLT I1 = LLT::integer(1);
Register Cmp;
unsigned PopCount = llvm::popcount(B.Mask);
if (PopCount == 1) {
@@ -1150,13 +1153,13 @@ void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB,
auto MaskTrailingZeros =
MIB.buildConstant(SwitchTy, llvm::countr_zero(B.Mask));
Cmp =
- MIB.buildICmp(ICmpInst::ICMP_EQ, LLT::scalar(1), Reg, MaskTrailingZeros)
+ MIB.buildICmp(ICmpInst::ICMP_EQ, I1, Reg, MaskTrailingZeros)
.getReg(0);
} else if (PopCount == BB.Range) {
// There is only one zero bit in the range, test for it directly.
auto MaskTrailingOnes =
MIB.buildConstant(SwitchTy, llvm::countr_one(B.Mask));
- Cmp = MIB.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Reg, MaskTrailingOnes)
+ Cmp = MIB.buildICmp(CmpInst::ICMP_NE, I1, Reg, MaskTrailingOnes)
.getReg(0);
} else {
// Make desired shift.
@@ -1167,7 +1170,7 @@ void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB,
auto CstMask = MIB.buildConstant(SwitchTy, B.Mask);
auto AndOp = MIB.buildAnd(SwitchTy, SwitchVal, CstMask);
auto CstZero = MIB.buildConstant(SwitchTy, 0);
- Cmp = MIB.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), AndOp, CstZero)
+ Cmp = MIB.buildICmp(CmpInst::ICMP_NE, I1, AndOp, CstZero)
.getReg(0);
}
@@ -1368,7 +1371,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
const Value *Ptr = LI.getPointerOperand();
Type *OffsetIRTy = DL->getIndexType(Ptr->getType());
- LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
if (CLI->supportSwiftError() && isSwiftError(Ptr)) {
assert(Regs.size() == 1 && "swifterror should be single pointer");
@@ -1415,7 +1418,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
Register Base = getOrCreateVReg(*SI.getPointerOperand());
Type *OffsetIRTy = DL->getIndexType(SI.getPointerOperandType());
- LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
if (CLI->supportSwiftError() && isSwiftError(SI.getPointerOperand())) {
assert(Vals.size() == 1 && "swifterror should be single pointer");
@@ -1538,8 +1541,8 @@ bool IRTranslator::translateCopy(const User &U, const Value &V,
bool IRTranslator::translateBitCast(const User &U,
MachineIRBuilder &MIRBuilder) {
// If we're bitcasting to the source type, we can reuse the source vreg.
- if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
- getLLTForType(*U.getType(), *DL)) {
+ if (getLLTForType(*U.getOperand(0)->getType(), *DL, EnableFPInfo) ==
+ getLLTForType(*U.getType(), *DL, EnableFPInfo)) {
// If the source is a ConstantInt then it was probably created by
// ConstantHoisting and we should leave it alone.
if (isa<ConstantInt>(U.getOperand(0)))
@@ -1572,9 +1575,9 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Value &Op0 = *U.getOperand(0);
Register BaseReg = getOrCreateVReg(Op0);
Type *PtrIRTy = Op0.getType();
- LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
Type *OffsetIRTy = DL->getIndexType(PtrIRTy);
- LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
uint32_t Flags = 0;
if (const Instruction *I = dyn_cast<Instruction>(&U))
@@ -1601,9 +1604,9 @@ bool IRTranslator::translateGetElementPtr(const User &U,
BaseReg)
.getReg(0);
PtrIRTy = FixedVectorType::get(PtrIRTy, VectorWidth);
- PtrTy = getLLTForType(*PtrIRTy, *DL);
+ PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
OffsetIRTy = DL->getIndexType(PtrIRTy);
- OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
}
int64_t Offset = 0;
@@ -1651,7 +1654,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Register GepOffsetReg;
if (ElementSize != 1) {
auto ElementSizeMIB = MIRBuilder.buildConstant(
- getLLTForType(*OffsetIRTy, *DL), ElementSize);
+ getLLTForType(*OffsetIRTy, *DL, EnableFPInfo), ElementSize);
GepOffsetReg =
MIRBuilder.buildMul(OffsetTy, IdxReg, ElementSizeMIB).getReg(0);
} else
@@ -1696,7 +1699,7 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
SrcRegs.push_back(SrcReg);
}
- LLT SizeTy = LLT::scalar(MinPtrSize);
+ LLT SizeTy = LLT::integer(MinPtrSize);
// The size operand should be the minimum of the pointer sizes.
Register &SizeOpReg = SrcRegs[SrcRegs.size() - 1];
@@ -2313,7 +2316,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
MIRBuilder.buildFMA(Dst, Op0, Op1, Op2,
MachineInstr::copyFlagsFromInstruction(CI));
} else {
- LLT Ty = getLLTForType(*CI.getType(), *DL);
+ LLT Ty = getLLTForType(*CI.getType(), *DL, EnableFPInfo);
auto FMul = MIRBuilder.buildFMul(
Ty, Op0, Op1, MachineInstr::copyFlagsFromInstruction(CI));
MIRBuilder.buildFAdd(Dst, FMul, Op2,
@@ -2380,7 +2383,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getStackGuard(getOrCreateVReg(CI), MIRBuilder);
return true;
case Intrinsic::stackprotector: {
- LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL, EnableFPInfo);
Register GuardVal;
if (TLI->useLoadStackGuardNode(*CI.getModule())) {
GuardVal = MRI->createGenericVirtualRegister(PtrTy);
@@ -2423,7 +2426,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return true;
}
case Intrinsic::invariant_start: {
- LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL, EnableFPInfo);
Register Undef = MRI->createGenericVirtualRegister(PtrTy);
MIRBuilder.buildUndef(Undef);
return true;
@@ -2622,7 +2625,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
case Intrinsic::vector_deinterleave2: {
// Both intrinsics have at least one operand.
Value *Op0 = CI.getOperand(0);
- LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout());
+ LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout(), EnableFPInfo);
if (!ResTy.isFixedVector())
return false;
@@ -2670,7 +2673,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
for (const auto &Arg : CB.args()) {
if (CLI->supportSwiftError() && isSwiftError(Arg)) {
assert(SwiftInVReg == 0 && "Expected only one swift error argument");
- LLT Ty = getLLTForType(*Arg->getType(), *DL);
+ LLT Ty = getLLTForType(*Arg->getType(), *DL, EnableFPInfo);
SwiftInVReg = MRI->createGenericVirtualRegister(Ty);
MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt(
&CB, &MIRBuilder.getMBB(), Arg));
@@ -2823,8 +2826,8 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
Align Alignment = Info.align.value_or(
DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
LLT MemTy = Info.memVT.isSimple()
- ? getLLTForMVT(Info.memVT.getSimpleVT())
- : LLT::scalar(Info.memVT.getStoreSizeInBits());
+ ? getLLTForMVT(Info.memVT.getSimpleVT(), EnableFPInfo)
+ : LLT::integer(Info.memVT.getStoreSizeInBits());
// TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
// didn't yield anything useful.
@@ -3030,13 +3033,13 @@ bool IRTranslator::translateLandingPad(const User &U,
if (auto *RegMask = TRI.getCustomEHPadPreservedMask(*MF))
MF->getRegInfo().addPhysRegsUsedFromRegMask(RegMask);
- LLT Ty = getLLTForType(*LP.getType(), *DL);
+ LLT Ty = getLLTForType(*LP.getType(), *DL, EnableFPInfo);
Register Undef = MRI->createGenericVirtualRegister(Ty);
MIRBuilder.buildUndef(Undef);
SmallVector<LLT, 2> Tys;
for (Type *Ty : cast<StructType>(LP.getType())->elements())
- Tys.push_back(getLLTForType(*Ty, *DL));
+ Tys.push_back(getLLTForType(*Ty, *DL, EnableFPInfo));
assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
// Mark exception register as live in.
@@ -3170,7 +3173,7 @@ bool IRTranslator::translateInsertElement(const User &U,
if (!Idx)
Idx = getOrCreateVReg(*U.getOperand(2));
if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
- const LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ const LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Idx = MIRBuilder.buildZExtOrTrunc(VecIdxTy, Idx).getReg(0);
}
MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx);
@@ -3213,7 +3216,7 @@ bool IRTranslator::translateInsertVector(const User &U,
if (isa<ScalableVectorType>(U.getOperand(0)->getType())) {
// We are inserting an illegal fixed vector into a scalable
// vector, use a scalar element insert.
- LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Register Idx = getOrCreateVReg(*CI);
auto ScaledIndex = MIRBuilder.buildMul(
VecIdxTy, MIRBuilder.buildVScale(VecIdxTy, 1), Idx);
@@ -3251,7 +3254,7 @@ bool IRTranslator::translateExtractElement(const User &U,
if (!Idx)
Idx = getOrCreateVReg(*U.getOperand(1));
if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
- const LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ const LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Idx = MIRBuilder.buildZExtOrTrunc(VecIdxTy, Idx).getReg(0);
}
MIRBuilder.buildExtractVectorElement(Res, Val, Idx);
@@ -3291,7 +3294,7 @@ bool IRTranslator::translateExtractVector(const User &U,
if (isa<ScalableVectorType>(U.getOperand(0)->getType())) {
// We are extracting an illegal fixed vector from a scalable
// vector, use a scalar element extract.
- LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Register Idx = getOrCreateVReg(*CI);
auto ScaledIndex = MIRBuilder.buildMul(
VecIdxTy, MIRBuilder.buildVScale(VecIdxTy, 1), Idx);
@@ -3819,8 +3822,8 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
CurBuilder->setInsertPt(*ParentBB, ParentBB->end());
// First create the loads to the guard/stack slot for the comparison.
Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
- const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
- LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL));
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
+ LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL), EnableFPInfo);
MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
int FI = MFI.getStackProtectorIndex();
@@ -3880,8 +3883,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
// If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
// Otherwise, emit a volatile load to retrieve the stack guard value.
if (TLI->useLoadStackGuardNode(*ParentBB->getBasicBlock()->getModule())) {
+ LLT RegTy = LLT::integer(PtrTy.getSizeInBits());
Guard =
- MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits()));
+ MRI->createGenericVirtualRegister(RegTy);
getStackGuard(Guard, *CurBuilder);
} else {
// TODO: test using android subtarget when we support @llvm.thread.pointer.
@@ -3897,8 +3901,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
}
// Perform the comparison.
+ LLT I1 = LLT::integer(1);
auto Cmp =
- CurBuilder->buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Guard, GuardVal);
+ CurBuilder->buildICmp(CmpInst::ICMP_NE, I1, Guard, GuardVal);
// If the guard/stackslot do not equal, branch to failure MBB.
CurBuilder->buildBrCond(Cmp, *SPD.getFailureMBB());
// Otherwise branch to success MBB.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 666615202a4b5f..f3e0c24796599b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -32,10 +32,19 @@ namespace {
/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
Register ValVReg, const CCValAssign &VA) {
- if (VA.getLocVT().getSizeInBits() < 32) {
+ LLT SrcTy = LLT(VA.getLocVT(), /*EnableFPInfo*/ true);
+
+ if (SrcTy.getSizeInBits() < 32) {
+ LLT I32 = LLT::integer(32);
+ LLT DstTy = LLT::integer(SrcTy.getSizeInBits());
+
+ Register SrcReg = ValVReg;
+ if (SrcTy.isFloat())
+ SrcReg = Handler.MIRBuilder.buildBitcast(DstTy, ValVReg).getReg(0);
+
// 16-bit types are reported as legal for 32-bit registers. We need to
// extend and do a 32-bit copy to avoid the verifier complaining about it.
- return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
+ return Handler.MIRBuilder.buildAnyExt(I32, SrcReg).getReg(0);
}
return Handler.extendRegister(ValVReg, VA);
@@ -119,16 +128,28 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
+ LLT LocTy = LLT(VA.getLocVT(), /* EnableFPInfo */ true);
- if (VA.getLocVT().getSizeInBits() < 32) {
+ if (LocTy.getSizeInBits() < 32) {
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
- auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+ Register CopyReg = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg).getReg(0);
+
+ if (LocTy.getScalarType().isFloat()) {
+ LLT TruncTy = LocTy.isVector()
+ ? LLT::vector(LocTy.getElementCount(),
+ LLT::integer(LocTy.getScalarSizeInBits()))
+ : LLT::integer(LocTy.getScalarSizeInBits());
+
+ auto Extended = buildExtensionHint(VA, CopyReg, TruncTy);
+ auto Trunc = MIRBuilder.buildTrunc(TruncTy, Extended);
+ MIRBuilder.buildBitcast(ValVReg, Trunc.getReg(0));
+ return;
+ }
// If we have signext/zeroext, it applies to the whole 32-bit register
// before truncation.
- auto Extended =
- buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
+ auto Extended = buildExtensionHint(VA, CopyReg, LocTy);
MIRBuilder.buildTrunc(ValVReg, Extended);
return;
}
@@ -332,7 +353,7 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
extOpcodeToISDExtOpcode(ExtendOp));
if (ExtVT != VT) {
RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
- LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
+ LLT ExtTy = getLLTForType(*RetInfo.Ty, DL, /* EnableFPInfo */ true);
Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
}
}
@@ -422,7 +443,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);
- LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
+ LLT ArgTy = getLLTForType(*SplitArg.Ty, DL, /* EnableFPInfo */ true);
if (SplitArg.Flags[0].isPointer()) {
// Compensate for losing pointeriness in splitValueTypes.
LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
>From 0f1d0d9cb84f28245ef41d9fdab60c810a49b7cf Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Wed, 11 Dec 2024 12:39:18 +0000
Subject: [PATCH 06/12] re-enable bfloat
---
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 28041033e6df57..f196dd8dc8f10c 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1556,10 +1556,6 @@ bool IRTranslator::translateBitCast(const User &U,
bool IRTranslator::translateCast(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
- if (U.getType()->getScalarType()->isBFloatTy() ||
- U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
- return false;
-
uint32_t Flags = 0;
if (const Instruction *I = dyn_cast<Instruction>(&U))
Flags = MachineInstr::copyFlagsFromInstruction(*I);
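With the early bail-out gone, casts touching bfloat take the normal translation path; conceptually the translator can now emit, for example, a G_FPEXT from a bfloat value (a sketch, assuming bfloat maps to LLT::bfloat() and the registers are already set up):

    #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
    using namespace llvm;

    // Sketch: a bf16 -> f32 extension built directly instead of falling back.
    // Dst is assumed to have type f32 and Src type bf16.
    static void bfloatFPExtSketch(MachineIRBuilder &B, Register Dst,
                                  Register Src) {
      B.buildFPExt(Dst, Src);
    }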
>From 0e65719f65afcb1f69ac2dc46e5a0f4a5722f779 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Wed, 11 Dec 2024 12:40:56 +0000
Subject: [PATCH 07/12] temp patch float -> integer
---
llvm/include/llvm/CodeGenTypes/LowLevelType.h | 12 ++++++------
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 ++++++--------
2 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index cf5f740c364d39..d42c4a5ed01fdb 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -139,25 +139,25 @@ class LLT {
}
// Get a 16-bit brain float value.
- static constexpr LLT bfloat() { return integer(16); }
+ static constexpr LLT bfloat() { return floatingPoint(16, FPInfo::VARIANT_FLOAT_1); }
/// Get a 16-bit IEEE half value.
- static constexpr LLT float16() { return integer(16); }
+ static constexpr LLT float16() { return floatingPoint(16, FPInfo::IEEE_FLOAT); }
/// Get a 32-bit IEEE float value.
- static constexpr LLT float32() { return integer(32); }
+ static constexpr LLT float32() { return floatingPoint(32, FPInfo::IEEE_FLOAT); }
/// Get a 64-bit IEEE double value.
- static constexpr LLT float64() { return integer(64); }
+ static constexpr LLT float64() { return floatingPoint(64, FPInfo::IEEE_FLOAT); }
/// Get a 80-bit X86 floating point value.
- static constexpr LLT x86fp80() { return integer(80); }
+ static constexpr LLT x86fp80() { return floatingPoint(80, FPInfo::VARIANT_FLOAT_1); }
/// Get a 128-bit IEEE quad value.
static constexpr LLT float128() { return floatingPoint(128, FPInfo::IEEE_FLOAT); }
/// Get a 128-bit PowerPC double double value.
- static constexpr LLT ppcf128() { return integer(128); }
+ static constexpr LLT ppcf128() { return floatingPoint(128, FPInfo::VARIANT_FLOAT_1); }
/// Get a low-level fixed-width vector of some number of elements and element
/// width.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 439cc78ed705e8..c358641cdef170 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -282,10 +282,11 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
+static const LLT F16 = LLT::integer(16);
static const LLT S32 = LLT::scalar(32);
-static const LLT F32 = LLT::float32();
+static const LLT F32 = LLT::integer(32);
static const LLT S64 = LLT::scalar(64);
-static const LLT F64 = LLT::float64();
+static const LLT F64 = LLT::integer(64);
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
@@ -305,7 +306,7 @@ static const LLT V10S16 = LLT::fixed_vector(10, 16);
static const LLT V12S16 = LLT::fixed_vector(12, 16);
static const LLT V16S16 = LLT::fixed_vector(16, 16);
-static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
+static const LLT V2F16 = LLT::fixed_vector(2, LLT::integer(16));
static const LLT V2BF16 = V2F16; // FIXME
static const LLT V2S32 = LLT::fixed_vector(2, 32);
@@ -3198,10 +3199,10 @@ bool AMDGPULegalizerInfo::legalizeFMad(
// TODO: Always legal with future ftz flag.
// FIXME: Do we need just output?
- if (Ty == LLT::float32() &&
+ if (Ty == F32 &&
MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
return true;
- if (Ty == LLT::float16() &&
+ if (Ty == F16 &&
MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
return true;
@@ -3753,8 +3754,6 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
Register Src1 = MI.getOperand(2).getReg();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
- const LLT F16 = LLT::float16();
- const LLT F32 = LLT::float32();
if (Ty == F32) {
auto Log = B.buildFLog2(F32, Src0, Flags);
@@ -3797,7 +3796,6 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
MachineIRBuilder &B) const {
const LLT S1 = LLT::scalar(1);
- const LLT F64 = LLT::float64();
Register Dst = MI.getOperand(0).getReg();
Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
>From 2004f21821598a87610d9cb29905b90d3ec59ab4 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Mon, 16 Dec 2024 12:45:09 +0000
Subject: [PATCH 08/12] AMDGPU legalizer WIP
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 19 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1795 +++++++++--------
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 11 +-
3 files changed, 929 insertions(+), 896 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a4239f2567146d..63cb2e6ef92b87 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2012,6 +2012,15 @@ Register LegalizerHelper::coerceToScalar(Register Val) {
void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
unsigned OpIdx, unsigned ExtOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
+ LLT SrcTy = MRI.getType(MO.getReg());
+
+ if (SrcTy.isFloat() && ExtOpcode != TargetOpcode::G_FPEXT) {
+ auto Cast = MIRBuilder.buildBitcast(SrcTy.dropType(), MO);
+ auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {Cast});
+ MO.setReg(ExtB.getReg(0));
+ return;
+ }
+
auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
MO.setReg(ExtB.getReg(0));
}
@@ -2026,8 +2035,18 @@ void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
unsigned OpIdx, unsigned TruncOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
+ LLT DstTy = MRI.getType(MO.getReg());
Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+
+ if (DstTy.isFloat() && TruncOpcode != TargetOpcode::G_FPTRUNC) {
+ auto Trunc = MIRBuilder.buildInstr(TruncOpcode, {DstTy.dropType()}, {DstExt});
+ MIRBuilder.buildBitcast(MO, Trunc);
+ MO.setReg(DstExt);
+ return;
+ }
+
MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
MO.setReg(DstExt);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c358641cdef170..2075fd0d27dd2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -52,6 +52,142 @@ static cl::opt<bool> EnableNewLegality(
static constexpr unsigned MaxRegisterSize = 1024;
+
+static const LLT I1 = LLT::integer(1);
+static const LLT I8 = LLT::integer(8);
+static const LLT I16 = LLT::integer(16);
+static const LLT F16 = LLT::float16();
+static const LLT BF16 = LLT::bfloat();
+static const LLT I32 = LLT::integer(32);
+static const LLT F32 = LLT::float32();
+static const LLT I64 = LLT::integer(64);
+static const LLT F64 = LLT::float64();
+static const LLT I96 = LLT::integer(96);
+static const LLT I128 = LLT::integer(128);
+static const LLT I160 = LLT::integer(160);
+static const LLT I192 = LLT::integer(192);
+static const LLT I224 = LLT::integer(224);
+static const LLT I256 = LLT::integer(256);
+static const LLT I512 = LLT::integer(512);
+static const LLT I1024 = LLT::integer(1024);
+static const LLT MaxScalar = LLT::integer(MaxRegisterSize);
+
+static const LLT V2I8 = LLT::fixed_vector(2, I8);
+static const LLT V2I16 = LLT::fixed_vector(2, I16);
+static const LLT V4I16 = LLT::fixed_vector(4, I16);
+static const LLT V6I16 = LLT::fixed_vector(6, I16);
+static const LLT V8I16 = LLT::fixed_vector(8, I16);
+static const LLT V10I16 = LLT::fixed_vector(10, I16);
+static const LLT V12I16 = LLT::fixed_vector(12, I16);
+static const LLT V16I16 = LLT::fixed_vector(16, I16);
+
+static const LLT V2F16 = LLT::fixed_vector(2, F16);
+static const LLT V4F16 = LLT::fixed_vector(4, F16);
+static const LLT V6F16 = LLT::fixed_vector(6, F16);
+static const LLT V8F16 = LLT::fixed_vector(8, F16);
+static const LLT V10F16 = LLT::fixed_vector(10, F16);
+static const LLT V12F16 = LLT::fixed_vector(12, F16);
+static const LLT V16F16 = LLT::fixed_vector(16, F16);
+
+static const LLT V2BF16 = LLT::fixed_vector(2, BF16);
+static const LLT V4BF16 = LLT::fixed_vector(4, BF16);
+static const LLT V6BF16 = LLT::fixed_vector(6, BF16);
+static const LLT V8BF16 = LLT::fixed_vector(8, BF16);
+static const LLT V10BF16 = LLT::fixed_vector(10, BF16);
+static const LLT V12BF16 = LLT::fixed_vector(12, BF16);
+static const LLT V16BF16 = LLT::fixed_vector(16, BF16);
+
+static const LLT V2I32 = LLT::fixed_vector(2, I32);
+static const LLT V3I32 = LLT::fixed_vector(3, I32);
+static const LLT V4I32 = LLT::fixed_vector(4, I32);
+static const LLT V5I32 = LLT::fixed_vector(5, I32);
+static const LLT V6I32 = LLT::fixed_vector(6, I32);
+static const LLT V7I32 = LLT::fixed_vector(7, I32);
+static const LLT V8I32 = LLT::fixed_vector(8, I32);
+static const LLT V9I32 = LLT::fixed_vector(9, I32);
+static const LLT V10I32 = LLT::fixed_vector(10, I32);
+static const LLT V11I32 = LLT::fixed_vector(11, I32);
+static const LLT V12I32 = LLT::fixed_vector(12, I32);
+static const LLT V16I32 = LLT::fixed_vector(16, I32);
+static const LLT V32I32 = LLT::fixed_vector(32, I32);
+
+static const LLT V2F32 = LLT::fixed_vector(2, F32);
+static const LLT V3F32 = LLT::fixed_vector(3, F32);
+static const LLT V4F32 = LLT::fixed_vector(4, F32);
+static const LLT V5F32 = LLT::fixed_vector(5, F32);
+static const LLT V6F32 = LLT::fixed_vector(6, F32);
+static const LLT V7F32 = LLT::fixed_vector(7, F32);
+static const LLT V8F32 = LLT::fixed_vector(8, F32);
+static const LLT V9F32 = LLT::fixed_vector(9, F32);
+static const LLT V10F32 = LLT::fixed_vector(10, F32);
+static const LLT V11F32 = LLT::fixed_vector(11, F32);
+static const LLT V12F32 = LLT::fixed_vector(12, F32);
+static const LLT V16F32 = LLT::fixed_vector(16, F32);
+static const LLT V32F32 = LLT::fixed_vector(32, F32);
+
+static const LLT V2I64 = LLT::fixed_vector(2, I64);
+static const LLT V3I64 = LLT::fixed_vector(3, I64);
+static const LLT V4I64 = LLT::fixed_vector(4, I64);
+static const LLT V5I64 = LLT::fixed_vector(5, I64);
+static const LLT V6I64 = LLT::fixed_vector(6, I64);
+static const LLT V7I64 = LLT::fixed_vector(7, I64);
+static const LLT V8I64 = LLT::fixed_vector(8, I64);
+static const LLT V16I64 = LLT::fixed_vector(16, I64);
+
+static const LLT V2F64 = LLT::fixed_vector(2, F64);
+static const LLT V3F64 = LLT::fixed_vector(3, F64);
+static const LLT V4F64 = LLT::fixed_vector(4, F64);
+static const LLT V5F64 = LLT::fixed_vector(5, F64);
+static const LLT V6F64 = LLT::fixed_vector(6, F64);
+static const LLT V7F64 = LLT::fixed_vector(7, F64);
+static const LLT V8F64 = LLT::fixed_vector(8, F64);
+static const LLT V16F64 = LLT::fixed_vector(16, F64);
+
+static const LLT V2I128 = LLT::fixed_vector(2, I128);
+static const LLT V4I128 = LLT::fixed_vector(4, I128);
+
+static std::initializer_list<LLT> AllScalarTypes = {
+ I16, F16, BF16, I32, F32, I64, F64, I96, I128, I160, I192, I224, I256, I512, I1024};
+
+static std::initializer_list<LLT> AllS16Vectors{
+ V2I16, V2F16, V2BF16,
+ V4I16, V4F16, V4BF16,
+ V6I16, V6F16, V6BF16,
+ V8I16, V8F16, V8BF16,
+ V10I16, V10F16, V10BF16,
+ V12I16, V12F16, V12BF16,
+ V16I16, V16F16, V16BF16,
+ V2I128,
+ V4I128,
+};
+
+static std::initializer_list<LLT> AllS32Vectors = {
+ V2I32, V2F32,
+ V3I32, V3F32,
+ V4I32, V4F32,
+ V5I32, V5F32,
+ V6I32, V6F32,
+ V7I32, V7F32,
+ V8I32, V8F32,
+ V9I32, V9F32,
+ V10I32, V10F32,
+ V11I32, V11F32,
+ V12I32, V12F32,
+ V16I32, V16F32,
+ V32I32, V32F32,
+};
+
+static std::initializer_list<LLT> AllS64Vectors = {
+ V2I64, V2F64,
+ V3I64, V3F64,
+ V4I64, V4F64,
+ V5I64, V5F64,
+ V6I64, V6F64,
+ V7I64, V7F64,
+ V8I64, V8F64,
+ V16I64, V16F64,
+};
+
// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
unsigned NElts = Ty.getNumElements();
@@ -60,10 +196,10 @@ static LLT getPow2VectorType(LLT Ty) {
}
// Round the number of bits to the next power of two bits
-static LLT getPow2ScalarType(LLT Ty) {
+static LLT getPow2IntegerType(LLT Ty) {
unsigned Bits = Ty.getSizeInBits();
unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
- return LLT::scalar(Pow2Bits);
+ return LLT::integer(Pow2Bits);
}
/// \returns true if this is an odd sized vector which should widen by adding an
@@ -161,16 +297,16 @@ static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
static LLT getBufferRsrcScalarType(const LLT Ty) {
if (!Ty.isVector())
- return LLT::scalar(128);
+ return I128;
const ElementCount NumElems = Ty.getElementCount();
- return LLT::vector(NumElems, LLT::scalar(128));
+ return LLT::vector(NumElems, I128);
}
static LLT getBufferRsrcRegisterType(const LLT Ty) {
if (!Ty.isVector())
- return LLT::fixed_vector(4, LLT::scalar(32));
+ return V4I32;
const unsigned NumElems = Ty.getElementCount().getFixedValue();
- return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
+ return LLT::fixed_vector(NumElems * 4, I32);
}
static LLT getBitcastRegisterType(const LLT Ty) {
@@ -179,10 +315,10 @@ static LLT getBitcastRegisterType(const LLT Ty) {
if (Size <= 32) {
// <2 x s8> -> s16
// <4 x s8> -> s32
- return LLT::scalar(Size);
+ return LLT::integer(Size);
}
- return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
+ return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), I32);
}
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
@@ -198,7 +334,7 @@ static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
unsigned Size = Ty.getSizeInBits();
assert(Size % 32 == 0);
return std::pair(
- TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
+ TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), I32));
};
}
@@ -279,79 +415,10 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
};
}
-static const LLT S1 = LLT::scalar(1);
-static const LLT S8 = LLT::scalar(8);
-static const LLT S16 = LLT::scalar(16);
-static const LLT F16 = LLT::integer(16);
-static const LLT S32 = LLT::scalar(32);
-static const LLT F32 = LLT::integer(32);
-static const LLT S64 = LLT::scalar(64);
-static const LLT F64 = LLT::integer(64);
-static const LLT S96 = LLT::scalar(96);
-static const LLT S128 = LLT::scalar(128);
-static const LLT S160 = LLT::scalar(160);
-static const LLT S192 = LLT::scalar(192);
-static const LLT S224 = LLT::scalar(224);
-static const LLT S256 = LLT::scalar(256);
-static const LLT S512 = LLT::scalar(512);
-static const LLT S1024 = LLT::scalar(1024);
-static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
-
-static const LLT V2S8 = LLT::fixed_vector(2, 8);
-static const LLT V2S16 = LLT::fixed_vector(2, 16);
-static const LLT V4S16 = LLT::fixed_vector(4, 16);
-static const LLT V6S16 = LLT::fixed_vector(6, 16);
-static const LLT V8S16 = LLT::fixed_vector(8, 16);
-static const LLT V10S16 = LLT::fixed_vector(10, 16);
-static const LLT V12S16 = LLT::fixed_vector(12, 16);
-static const LLT V16S16 = LLT::fixed_vector(16, 16);
-
-static const LLT V2F16 = LLT::fixed_vector(2, LLT::integer(16));
-static const LLT V2BF16 = V2F16; // FIXME
-
-static const LLT V2S32 = LLT::fixed_vector(2, 32);
-static const LLT V3S32 = LLT::fixed_vector(3, 32);
-static const LLT V4S32 = LLT::fixed_vector(4, 32);
-static const LLT V5S32 = LLT::fixed_vector(5, 32);
-static const LLT V6S32 = LLT::fixed_vector(6, 32);
-static const LLT V7S32 = LLT::fixed_vector(7, 32);
-static const LLT V8S32 = LLT::fixed_vector(8, 32);
-static const LLT V9S32 = LLT::fixed_vector(9, 32);
-static const LLT V10S32 = LLT::fixed_vector(10, 32);
-static const LLT V11S32 = LLT::fixed_vector(11, 32);
-static const LLT V12S32 = LLT::fixed_vector(12, 32);
-static const LLT V16S32 = LLT::fixed_vector(16, 32);
-static const LLT V32S32 = LLT::fixed_vector(32, 32);
-
-static const LLT V2S64 = LLT::fixed_vector(2, 64);
-static const LLT V3S64 = LLT::fixed_vector(3, 64);
-static const LLT V4S64 = LLT::fixed_vector(4, 64);
-static const LLT V5S64 = LLT::fixed_vector(5, 64);
-static const LLT V6S64 = LLT::fixed_vector(6, 64);
-static const LLT V7S64 = LLT::fixed_vector(7, 64);
-static const LLT V8S64 = LLT::fixed_vector(8, 64);
-static const LLT V16S64 = LLT::fixed_vector(16, 64);
-
-static const LLT V2S128 = LLT::fixed_vector(2, 128);
-static const LLT V4S128 = LLT::fixed_vector(4, 128);
-
-static std::initializer_list<LLT> AllScalarTypes = {
- S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
-
-static std::initializer_list<LLT> AllS16Vectors{
- V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
-
-static std::initializer_list<LLT> AllS32Vectors = {
- V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
- V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
-
-static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
- V6S64, V7S64, V8S64, V16S64};
-
// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(LLT Ty) {
if (Ty.isPointerOrPointerVector())
- Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
+ Ty = Ty.changeElementType(LLT::integer(Ty.getScalarSizeInBits()));
return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
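
With the S* constants gone, isRegisterClassType keys the lookup on the typed lists defined at the top of the file; pointers and pointer vectors are still accepted by size, by normalizing them to integer elements of the same width before the lookup. A small sketch of that normalization (LLT::pointer is the existing constructor; LLT::integer and changeElementType are assumed to behave as in the hunk above, which may still change in this draft):

  // A <2 x p0> is checked as <2 x i64>, i.e. the V2I64 entry above.
  LLT PtrVec = LLT::fixed_vector(2, LLT::pointer(0, 64));
  LLT Norm   = PtrVec.changeElementType(
      LLT::integer(PtrVec.getScalarSizeInBits()));
  assert(Norm == LLT::fixed_vector(2, LLT::integer(64)));
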
@@ -598,14 +665,13 @@ static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
if (!PointerTy.isVector()) {
// Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
- const LLT S32 = LLT::scalar(32);
Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
std::array<Register, 4> VectorElems;
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
for (unsigned I = 0; I < NumParts; ++I)
VectorElems[I] =
- B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
+ B.buildExtractVectorElementConstant(I32, VectorReg, I).getReg(0);
B.buildMergeValues(MO, VectorElems);
MO.setReg(VectorReg);
return VectorTy;
@@ -634,7 +700,7 @@ static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
// Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
SmallVector<Register, 4> PointerParts;
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
- auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
+ auto Unmerged = B.buildUnmerge(I32, Pointer);
for (unsigned I = 0; I < NumParts; ++I)
PointerParts.push_back(Unmerged.getReg(I));
return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
@@ -688,35 +754,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
const std::initializer_list<LLT> FPTypesBase = {
- S32, S64
+ F32, F64
};
const std::initializer_list<LLT> FPTypes16 = {
- S32, S64, S16
+ F32, F64, F16, BF16
};
const std::initializer_list<LLT> FPTypesPK16 = {
- S32, S64, S16, V2S16
+ F32, F64, F16, BF16, V2F16, V2BF16
};
- const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
+ const LLT MinScalarFPTy = ST.has16BitInsts() ? F16 : F32;
// s1 for VCC branches, s32 for SCC branches.
- getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({I1, I32});
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
getActionDefinitionsBuilder(G_PHI)
- .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
+ .legalFor({I32, F32, I64, F64, V2I16, V2F16, V2BF16, I16, F16, BF16, V4I16, V4F16, V4BF16, I1, I128, I256})
.legalFor(AllS32Vectors)
.legalFor(AllS64Vectors)
.legalFor(AddrSpaces64)
.legalFor(AddrSpaces32)
.legalFor(AddrSpaces128)
.legalIf(isPointer(0))
- .clampScalar(0, S16, S256)
+ .clampScalar(0, I16, I256)
.widenScalarToNextPow2(0, 32)
- .clampMaxNumElements(0, S32, 16)
+ .clampMaxNumElements(0, I32, 16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.scalarize(0);
@@ -724,60 +790,60 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Full set of gfx9 features.
if (ST.hasScalarAddSub64()) {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S64, S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I64, I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
+ .maxScalar(0, I32);
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
+ .maxScalar(0, I32);
}
if (ST.hasScalarSMulU64()) {
getActionDefinitionsBuilder(G_MUL)
- .legalFor({S64, S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I64, I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
.custom();
} else {
getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
.custom();
}
assert(ST.hasMad64_32());
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
- .legalFor({S32, S16, V2S16}) // Clamp modifier
- .minScalarOrElt(0, S16)
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I32, I16, V2I16}) // Clamp modifier
+ .minScalarOrElt(0, I16)
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.lower();
} else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32, S16})
- .minScalar(0, S16)
+ .legalFor({I32, I16})
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32)
+ .maxScalar(0, I32)
.scalarize(0);
getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32, S16})
+ .legalFor({I32, I16})
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
.custom();
assert(ST.hasMad64_32());
@@ -785,8 +851,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Technically the saturating operations require clamp bit support, but this
// was introduced at the same time as 16-bit operations.
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .legalFor({S32, S16}) // Clamp modifier
- .minScalar(0, S16)
+ .legalFor({I32, I16}) // Clamp modifier
+ .minScalar(0, I16)
.scalarize(0)
.widenScalarToNextPow2(0, 16)
.lower();
@@ -794,37 +860,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// We're just lowering this, but it helps get a better result to try to
// coerce to the desired type first.
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
- .minScalar(0, S16)
+ .minScalar(0, I16)
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32})
+ .legalFor({I32})
.widenScalarToNextMultipleOf(0, 32)
- .clampScalar(0, S32, S32)
+ .clampScalar(0, I32, I32)
.scalarize(0);
auto &Mul = getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32})
+ .legalFor({I32})
.scalarize(0)
- .minScalar(0, S32)
+ .minScalar(0, I32)
.widenScalarToNextMultipleOf(0, 32);
if (ST.hasMad64_32())
Mul.custom();
else
- Mul.maxScalar(0, S32);
+ Mul.maxScalar(0, I32);
if (ST.hasIntClamp()) {
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .legalFor({S32}) // Clamp modifier.
+ .legalFor({I32}) // Clamp modifier.
.scalarize(0)
- .minScalarOrElt(0, S32)
+ .minScalarOrElt(0, I32)
.lower();
} else {
// Clamp bit support was added in VI, along with 16-bit operations.
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .minScalar(0, S32)
+ .minScalar(0, I32)
.scalarize(0)
.lower();
}
@@ -832,26 +898,26 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: DAG expansion gets better results. The widening uses the smaller
// range values and goes for the min/max lowering directly.
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
- .minScalar(0, S32)
+ .minScalar(0, I32)
.scalarize(0)
.lower();
}
getActionDefinitionsBuilder(
{G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
- .customFor({S32, S64})
- .clampScalar(0, S32, S64)
+ .customFor({I32, I64})
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(0, 32)
.scalarize(0);
auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
- .legalFor({S32})
- .maxScalar(0, S32);
+ .legalFor({I32})
+ .maxScalar(0, I32);
if (ST.hasVOP3PInsts()) {
Mulh
- .clampMaxNumElements(0, S8, 2)
- .lowerFor({V2S8});
+ .clampMaxNumElements(0, I8, 2)
+ .lowerFor({V2I8});
}
Mulh
@@ -861,8 +927,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Report legal for any types we can handle anywhere. For the cases only legal
// on the SALU, RegBankSelect will be able to re-legalize.
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
- .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
- .clampScalar(0, S32, S64)
+ .legalFor({I32, I1, I64, V2I32, I16, V2I16, V4I16})
+ .clampScalar(0, I32, I64)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
.widenScalarToNextPow2(0)
@@ -870,8 +936,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(
{G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
- .legalFor({{S32, S1}, {S32, S32}})
- .clampScalar(0, S32, S32)
+ .legalFor({{I32, I1}, {I32, I32}})
+ .clampScalar(0, I32, I32)
.scalarize(0);
getActionDefinitionsBuilder(G_BITCAST)
@@ -880,40 +946,42 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({S1, S32, S64, S16, GlobalPtr,
+ .legalFor({I1, I32, I64, I16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
.legalIf(isPointer(0))
- .clampScalar(0, S32, S64)
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64, S16})
- .clampScalar(0, S16, S64);
+ .legalFor({F32, F64, F16, BF16})
+ .clampScalar(0, F16, F64);
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
.legalIf(isRegisterClassType(0))
// s1 and s16 are special cases because they have legal operations on
// them, but don't really occupy registers in the normal way.
- .legalFor({S1, S16})
- .clampNumElements(0, V16S32, V32S32)
+ .legalFor({I1, I16, F16, BF16})
+ .clampNumElements(0, V16I32, V32I32)
+ .clampNumElements(0, V16F32, V32F32)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampScalarOrElt(0, S32, MaxScalar)
+ .clampScalarOrElt(0, I32, MaxScalar)
.widenScalarToNextPow2(0, 32)
- .clampMaxNumElements(0, S32, 16);
+ .clampMaxNumElements(0, I32, 16)
+ .clampMaxNumElements(0, F32, 16);
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
// If the amount is divergent, we have to do a wave reduction to get the
// maximum value, so this is expanded during RegBankSelect.
getActionDefinitionsBuilder(G_DYN_STACKALLOC)
- .legalFor({{PrivatePtr, S32}});
+ .legalFor({{PrivatePtr, I32}, {PrivatePtr, F32}});
getActionDefinitionsBuilder(G_STACKSAVE)
.customFor({PrivatePtr});
getActionDefinitionsBuilder(G_STACKRESTORE)
.legalFor({PrivatePtr});
- getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
+ getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({I64});
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.customIf(typeIsNot(0, PrivatePtr));
@@ -923,25 +991,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &FPOpActions = getActionDefinitionsBuilder(
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
- .legalFor({S32, S64});
+ .legalFor({F32, F64});
auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
- .customFor({S32, S64});
+ .customFor({F32, F64});
auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
- .customFor({S32, S64});
+ .customFor({F32, F64});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts())
- FPOpActions.legalFor({S16, V2S16});
+ FPOpActions.legalFor({F16, V2F16});
else
- FPOpActions.legalFor({S16});
+ FPOpActions.legalFor({F16});
- TrigActions.customFor({S16});
- FDIVActions.customFor({S16});
+ TrigActions.customFor({F16});
+ FDIVActions.customFor({F16});
}
if (ST.hasPackedFP32Ops()) {
- FPOpActions.legalFor({V2S32});
- FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
+ FPOpActions.legalFor({V2F32});
+ FPOpActions.clampMaxNumElementsStrict(0, F32, 2);
}
auto &MinNumMaxNum = getActionDefinitionsBuilder({
@@ -950,154 +1018,154 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampMaxNumElements(0, S16, 2)
- .clampScalar(0, S16, S64)
+ .clampMaxNumElements(0, F16, 2)
+ .clampScalar(0, F16, F64)
.scalarize(0);
} else if (ST.has16BitInsts()) {
MinNumMaxNum.customFor(FPTypes16)
- .clampScalar(0, S16, S64)
+ .clampScalar(0, F16, F64)
.scalarize(0);
} else {
MinNumMaxNum.customFor(FPTypesBase)
- .clampScalar(0, S32, S64)
+ .clampScalar(0, F32, F64)
.scalarize(0);
}
if (ST.hasVOP3PInsts())
- FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
-
+ FPOpActions.clampMaxNumElementsStrict(0, F16, 2);
+
FPOpActions
.scalarize(0)
- .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ .clampScalar(0, ST.has16BitInsts() ? F16 : F32, F64);
TrigActions
.scalarize(0)
- .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ .clampScalar(0, ST.has16BitInsts() ? F16 : F32, F64);
FDIVActions
.scalarize(0)
- .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ .clampScalar(0, ST.has16BitInsts() ? F16 : F32, F64);
getActionDefinitionsBuilder({G_FNEG, G_FABS})
.legalFor(FPTypesPK16)
- .clampMaxNumElementsStrict(0, S16, 2)
+ .clampMaxNumElementsStrict(0, F16, 2)
.scalarize(0)
- .clampScalar(0, S16, S64);
+ .clampScalar(0, F16, F64);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_FSQRT)
- .legalFor({S16})
- .customFor({S32, S64})
+ .legalFor({F16})
+ .customFor({F32, F64})
.scalarize(0)
.unsupported();
getActionDefinitionsBuilder(G_FFLOOR)
- .legalFor({S32, S64, S16})
+ .legalFor({F32, F64, F16})
.scalarize(0)
- .clampScalar(0, S16, S64);
+ .clampScalar(0, F16, F64);
getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
- .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
+ .legalFor({{F32, I32}, {F64, I32}, {F16, I16}})
.scalarize(0)
- .maxScalarIf(typeIs(0, S16), 1, S16)
- .clampScalar(1, S32, S32)
+ .maxScalarIf(typeIs(0, F16), 1, I16)
+ .clampScalar(1, I32, I32)
.lower();
getActionDefinitionsBuilder(G_FFREXP)
- .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
+ .customFor({{F32, F32}, {F64, F32}, {F16, F16}, {F16, F32}})
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder(G_FSQRT)
- .customFor({S32, S64, S16})
+ .customFor({F32, F64, F16})
.scalarize(0)
.unsupported();
if (ST.hasFractBug()) {
getActionDefinitionsBuilder(G_FFLOOR)
- .customFor({S64})
- .legalFor({S32, S64})
+ .customFor({F64})
+ .legalFor({F32, F64})
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, F32, F64);
} else {
getActionDefinitionsBuilder(G_FFLOOR)
- .legalFor({S32, S64})
+ .legalFor({F32, F64})
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, F32, F64);
}
getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
- .legalFor({{S32, S32}, {S64, S32}})
+ .legalFor({{F32, I32}, {F64, I32}})
.scalarize(0)
- .clampScalar(0, S32, S64)
- .clampScalar(1, S32, S32)
+ .clampScalar(0, F32, F64)
+ .clampScalar(1, I32, I32)
.lower();
getActionDefinitionsBuilder(G_FFREXP)
- .customFor({{S32, S32}, {S64, S32}})
+ .customFor({{F32, F32}, {F64, F32}})
.scalarize(0)
- .minScalar(0, S32)
- .clampScalar(1, S32, S32)
+ .minScalar(0, I32)
+ .clampScalar(1, I32, I32)
.lower();
}
auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
if (ST.hasCvtPkF16F32Inst())
FPTruncActions.legalFor(
- {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}});
+ {{F32, F64}, {F16, F32}, {V2F16, V2F32}, {V2F16, V2F64}});
else
- FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
+ FPTruncActions.legalFor({{F32, F64}, {F16, F32}});
FPTruncActions.scalarize(0).lower();
getActionDefinitionsBuilder(G_FPEXT)
- .legalFor({{S64, S32}, {S32, S16}})
- .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
+ .legalFor({{F64, F32}, {F32, F16}})
+ .narrowScalarFor({{I64, I16}}, changeTo(0, I32))
.scalarize(0);
auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
if (ST.has16BitInsts()) {
FSubActions
// Use actual fsub instruction
- .legalFor({S32, S16})
+ .legalFor({F32, F16})
// Must use fadd + fneg
- .lowerFor({S64, V2S16});
+ .lowerFor({F64, V2F16});
} else {
FSubActions
// Use actual fsub instruction
- .legalFor({S32})
+ .legalFor({F32})
// Must use fadd + fneg
- .lowerFor({S64, S16, V2S16});
+ .lowerFor({F64, F16, V2F16});
}
FSubActions
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, F32, F64);
// Whether this is legal depends on the floating point mode for the function.
auto &FMad = getActionDefinitionsBuilder(G_FMAD);
if (ST.hasMadF16() && ST.hasMadMacF32Insts())
- FMad.customFor({S32, S16});
+ FMad.customFor({F32, F16});
else if (ST.hasMadMacF32Insts())
- FMad.customFor({S32});
+ FMad.customFor({F32});
else if (ST.hasMadF16())
- FMad.customFor({S16});
+ FMad.customFor({F16});
FMad.scalarize(0)
.lower();
auto &FRem = getActionDefinitionsBuilder(G_FREM);
if (ST.has16BitInsts()) {
- FRem.customFor({S16, S32, S64});
+ FRem.customFor({F16, F32, F64});
} else {
- FRem.minScalar(0, S32)
- .customFor({S32, S64});
+ FRem.minScalar(0, F32)
+ .customFor({F32, F64});
}
FRem.scalarize(0);
// TODO: Do we need to clamp maximum bitwidth?
getActionDefinitionsBuilder(G_TRUNC)
.legalIf(isScalar(0))
- .legalFor({{V2S16, V2S32}})
- .clampMaxNumElements(0, S16, 2)
+ .legalFor({{V2F16, V2F32}})
+ .clampMaxNumElements(0, F16, 2)
// Avoid scalarizing in cases that should be truly illegal. In unresolvable
// situations (like an invalid implicit use), we don't want to infinite loop
// in the legalizer.
@@ -1105,45 +1173,45 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.alwaysLegal();
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
- .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
- {S32, S1}, {S64, S1}, {S16, S1}})
+ .legalFor({{I64, I32}, {I32, I16}, {I64, I16},
+ {I32, I1}, {I64, I1}, {I16, I1}})
.scalarize(0)
- .clampScalar(0, S32, S64)
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(1, 32);
// TODO: Split s1->s64 during regbankselect for VALU.
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
- .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
- .lowerIf(typeIs(1, S1))
- .customFor({{S32, S64}, {S64, S64}});
+ .legalFor({{F32, I32}, {F64, I32}, {F16, I32}})
+ .lowerIf(typeIs(1, I1))
+ .customFor({{F32, I64}, {F64, I64}});
if (ST.has16BitInsts())
- IToFP.legalFor({{S16, S16}});
- IToFP.clampScalar(1, S32, S64)
- .minScalar(0, S32)
+ IToFP.legalFor({{F16, I16}});
+ IToFP.clampScalar(1, I32, I64)
+ .minScalar(0, I32)
.scalarize(0)
.widenScalarToNextPow2(1);
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
- .customFor({{S64, S32}, {S64, S64}})
- .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
+ .legalFor({{I32, F32}, {I32, F64}, {I32, F16}})
+ .customFor({{I64, F32}, {I64, F64}})
+ .narrowScalarFor({{I64, I16}}, changeTo(0, I32));
if (ST.has16BitInsts())
- FPToI.legalFor({{S16, S16}});
+ FPToI.legalFor({{I16, F16}});
else
- FPToI.minScalar(1, S32);
+ FPToI.minScalar(1, I32);
- FPToI.minScalar(0, S32)
+ FPToI.minScalar(0, I32)
.widenScalarToNextPow2(0, 32)
.scalarize(0)
.lower();
getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
- .clampScalar(0, S16, S64)
+ .clampScalar(0, F16, F64)
.scalarize(0)
.lower();
getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
- .legalFor({S16, S32})
+ .legalFor({F16, F32})
.scalarize(0)
.lower();
@@ -1153,28 +1221,28 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
- .clampScalar(0, S16, S64)
+ .clampScalar(0, F16, F64)
.scalarize(0)
.lower();
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
- .legalFor({S16, S32, S64})
- .clampScalar(0, S16, S64)
+ .legalFor({F16, F32, F64})
+ .clampScalar(0, F16, F64)
.scalarize(0);
} else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
getActionDefinitionsBuilder(
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
- .legalFor({S32, S64})
- .clampScalar(0, S32, S64)
+ .legalFor({F32, F64})
+ .clampScalar(0, F32, F64)
.scalarize(0);
} else {
getActionDefinitionsBuilder(
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
- .legalFor({S32})
- .customFor({S64})
- .clampScalar(0, S32, S64)
+ .legalFor({F32})
+ .customFor({F64})
+ .clampScalar(0, F32, F64)
.scalarize(0);
}
@@ -1185,7 +1253,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarSameSizeAs(1, 0);
getActionDefinitionsBuilder(G_PTRMASK)
- .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
+ .legalIf(all(sameSize(0, 1), typeInSet(1, {I64, I32})))
.scalarSameSizeAs(1, 0)
.scalarize(0);
@@ -1202,79 +1270,79 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
// bank.
.legalForCartesianProduct(
- {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
+ {I1}, {I32, I64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
.legalForCartesianProduct(
- {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
+ {I32}, {I32, I64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
- CmpBuilder.legalFor({{S1, S16}});
+ CmpBuilder.legalFor({{I1, I16}});
}
CmpBuilder
.widenScalarToNextPow2(1)
- .clampScalar(1, S32, S64)
+ .clampScalar(1, I32, I64)
.scalarize(0)
- .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
+ .legalIf(all(typeInSet(0, {I1, I32}), isPointer(1)));
auto &FCmpBuilder =
getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
- {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
+ {I1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
if (ST.hasSALUFloatInsts())
- FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
+ FCmpBuilder.legalForCartesianProduct({I32}, {F16, F32});
FCmpBuilder
.widenScalarToNextPow2(1)
- .clampScalar(1, S32, S64)
+ .clampScalar(1, F32, F64)
.scalarize(0);
// FIXME: fpow has a selection pattern that should move to custom lowering.
auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
if (ST.has16BitInsts())
- ExpOps.customFor({{S32}, {S16}});
+ ExpOps.customFor({{F32}, {F16}});
else
- ExpOps.customFor({S32});
- ExpOps.clampScalar(0, MinScalarFPTy, S32)
+ ExpOps.customFor({F32});
+ ExpOps.clampScalar(0, MinScalarFPTy, F32)
.scalarize(0);
getActionDefinitionsBuilder(G_FPOWI)
- .clampScalar(0, MinScalarFPTy, S32)
+ .clampScalar(0, MinScalarFPTy, F32)
.lower();
auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
- Log2Ops.customFor({S32});
+ Log2Ops.customFor({F32});
if (ST.has16BitInsts())
- Log2Ops.legalFor({S16});
+ Log2Ops.legalFor({F16});
else
- Log2Ops.customFor({S16});
+ Log2Ops.customFor({F16});
Log2Ops.scalarize(0)
.lower();
auto &LogOps =
getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
- LogOps.customFor({S32, S16});
- LogOps.clampScalar(0, MinScalarFPTy, S32)
+ LogOps.customFor({F32, F16});
+ LogOps.clampScalar(0, MinScalarFPTy, F32)
.scalarize(0);
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
- .legalFor({{S32, S32}, {S32, S64}})
- .clampScalar(0, S32, S32)
+ .legalFor({{I32, I32}, {I32, I64}})
+ .clampScalar(0, I32, I32)
.widenScalarToNextPow2(1, 32)
- .clampScalar(1, S32, S64)
+ .clampScalar(1, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0, 32);
// If no 16 bit instr is available, lower into different instructions.
if (ST.has16BitInsts())
getActionDefinitionsBuilder(G_IS_FPCLASS)
- .legalForCartesianProduct({S1}, FPTypes16)
+ .legalForCartesianProduct({I1}, FPTypes16)
.widenScalarToNextPow2(1)
.scalarize(0)
.lower();
else
getActionDefinitionsBuilder(G_IS_FPCLASS)
- .legalForCartesianProduct({S1}, FPTypesBase)
- .lowerFor({S1, S16})
+ .legalForCartesianProduct({I1}, FPTypesBase)
+ .lowerFor({I1, I16})
.widenScalarToNextPow2(1)
.scalarize(0)
.lower();
@@ -1284,26 +1352,26 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// bitwidth.
getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
.scalarize(0)
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
+ .clampScalar(0, I32, I32)
+ .clampScalar(1, I32, I64)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32)
.custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
- .legalFor({{S32, S32}, {S32, S64}})
+ .legalFor({{I32, I32}, {I32, I64}})
.customIf(scalarNarrowerThan(1, 32))
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
+ .clampScalar(0, I32, I32)
+ .clampScalar(1, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
- .legalFor({{S32, S32}, {S32, S64}})
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
+ .legalFor({{I32, I32}, {I32, I64}})
+ .clampScalar(0, I32, I32)
+ .clampScalar(1, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
@@ -1311,52 +1379,52 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
getActionDefinitionsBuilder(G_BITREVERSE)
- .legalFor({S32, S64})
- .clampScalar(0, S32, S64)
+ .legalFor({I32, I64})
+ .clampScalar(0, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_BSWAP)
- .legalFor({S16, S32, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I16, I32, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
// FIXME: Fixing non-power-of-2 before clamp is workaround for
// narrowScalar limitation.
.widenScalarToNextPow2(0)
- .clampScalar(0, S16, S32)
+ .clampScalar(0, I16, I32)
.scalarize(0);
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElements(0, S16, 2)
- .minScalar(0, S16)
+ .legalFor({I32, I16, V2I16})
+ .clampMaxNumElements(0, I16, 2)
+ .minScalar(0, I16)
.widenScalarToNextPow2(0)
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16})
+ .legalFor({I32, I16})
.widenScalarToNextPow2(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.scalarize(0)
.lower();
}
} else {
// TODO: Should have same legality without v_perm_b32
getActionDefinitionsBuilder(G_BSWAP)
- .legalFor({S32})
+ .legalFor({I32})
.lowerIf(scalarNarrowerThan(0, 32))
// FIXME: Fixing non-power-of-2 before clamp is workaround for
// narrowScalar limitation.
.widenScalarToNextPow2(0)
- .maxScalar(0, S32)
+ .maxScalar(0, I32)
.scalarize(0)
.lower();
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32})
- .minScalar(0, S32)
+ .legalFor({I32})
+ .minScalar(0, I32)
.widenScalarToNextPow2(0)
.scalarize(0)
.lower();
@@ -1364,8 +1432,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_INTTOPTR)
// List the common cases
- .legalForCartesianProduct(AddrSpaces64, {S64})
- .legalForCartesianProduct(AddrSpaces32, {S32})
+ .legalForCartesianProduct(AddrSpaces64, {I64})
+ .legalForCartesianProduct(AddrSpaces32, {I32})
.scalarize(0)
// Accept any address space as long as the size matches
.legalIf(sameSize(0, 1))
@@ -1380,18 +1448,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_PTRTOINT)
// List the common cases
- .legalForCartesianProduct(AddrSpaces64, {S64})
- .legalForCartesianProduct(AddrSpaces32, {S32})
+ .legalForCartesianProduct(AddrSpaces64, {I64})
+ .legalForCartesianProduct(AddrSpaces32, {I32})
.scalarize(0)
// Accept any address space as long as the size matches
.legalIf(sameSize(0, 1))
.widenScalarIf(smallerThan(0, 1),
[](const LegalityQuery &Query) {
return std::pair(
- 0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ 0, LLT::integer(Query.Types[1].getSizeInBits()));
})
.narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
- return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ return std::pair(0, LLT::integer(Query.Types[1].getSizeInBits()));
});
getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
@@ -1444,32 +1512,50 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &Actions = getActionDefinitionsBuilder(Op);
// Explicitly list some common cases.
// TODO: Does this help compile time at all?
- Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
- {V2S32, GlobalPtr, V2S32, GlobalAlign32},
- {V4S32, GlobalPtr, V4S32, GlobalAlign32},
- {S64, GlobalPtr, S64, GlobalAlign32},
- {V2S64, GlobalPtr, V2S64, GlobalAlign32},
- {V2S16, GlobalPtr, V2S16, GlobalAlign32},
- {S32, GlobalPtr, S8, GlobalAlign8},
- {S32, GlobalPtr, S16, GlobalAlign16},
-
- {S32, LocalPtr, S32, 32},
- {S64, LocalPtr, S64, 32},
- {V2S32, LocalPtr, V2S32, 32},
- {S32, LocalPtr, S8, 8},
- {S32, LocalPtr, S16, 16},
- {V2S16, LocalPtr, S32, 32},
-
- {S32, PrivatePtr, S32, 32},
- {S32, PrivatePtr, S8, 8},
- {S32, PrivatePtr, S16, 16},
- {V2S16, PrivatePtr, S32, 32},
-
- {S32, ConstantPtr, S32, GlobalAlign32},
- {V2S32, ConstantPtr, V2S32, GlobalAlign32},
- {V4S32, ConstantPtr, V4S32, GlobalAlign32},
- {S64, ConstantPtr, S64, GlobalAlign32},
- {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
+ Actions.legalForTypesWithMemDesc({{I32, GlobalPtr, I32, GlobalAlign32},
+ {F32, GlobalPtr, F32, GlobalAlign32},
+
+ {V2I32, GlobalPtr, V2I32, GlobalAlign32},
+ {V2F32, GlobalPtr, V2F32, GlobalAlign32},
+
+ {V4I32, GlobalPtr, V4I32, GlobalAlign32},
+ {V4F32, GlobalPtr, V4F32, GlobalAlign32},
+
+ {I64, GlobalPtr, I64, GlobalAlign32},
+ {F64, GlobalPtr, F64, GlobalAlign32},
+
+ {V2I64, GlobalPtr, V2I64, GlobalAlign32},
+ {V2F64, GlobalPtr, V2F64, GlobalAlign32},
+ {V2I16, GlobalPtr, V2I16, GlobalAlign32},
+ {V2F16, GlobalPtr, V2F16, GlobalAlign32},
+ {V2BF16, GlobalPtr, V2BF16, GlobalAlign32},
+
+ {I32, GlobalPtr, I8, GlobalAlign8},
+ {I32, GlobalPtr, I16, GlobalAlign16},
+
+ {I32, LocalPtr, I32, 32},
+ {F32, LocalPtr, F32, 32},
+ {I64, LocalPtr, I64, 32},
+ {F64, LocalPtr, F64, 32},
+ {V2I32, LocalPtr, V2I32, 32},
+ {V2F32, LocalPtr, V2F32, 32},
+ {I32, LocalPtr, I8, 8},
+ {I32, LocalPtr, I16, 16},
+ {V2I16, LocalPtr, I32, 32},
+
+ {I32, PrivatePtr, I32, 32},
+ {F32, PrivatePtr, F32, 32},
+ {I32, PrivatePtr, I8, 8},
+ {I32, PrivatePtr, I16, 16},
+ {V2I16, PrivatePtr, I32, 32},
+
+ {I32, ConstantPtr, I32, GlobalAlign32},
+ {F32, ConstantPtr, F32, GlobalAlign32},
+ {V2I32, ConstantPtr, V2I32, GlobalAlign32},
+ {V4I32, ConstantPtr, V4I32, GlobalAlign32},
+ {I64, ConstantPtr, I64, GlobalAlign32},
+ {F64, ConstantPtr, F64, GlobalAlign32},
+ {V2I32, ConstantPtr, V2I32, GlobalAlign32}});
Actions.legalIf(
[=](const LegalityQuery &Query) -> bool {
return isLoadStoreLegal(ST, Query);
@@ -1531,16 +1617,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Split extloads.
if (DstSize > MemSize)
- return std::pair(0, LLT::scalar(MemSize));
+ return std::pair(0, LLT::integer(MemSize));
unsigned MaxSize = maxSizeForAddrSpace(
ST, PtrTy.getAddressSpace(), Op == G_LOAD,
Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
if (MemSize > MaxSize)
- return std::pair(0, LLT::scalar(MaxSize));
+ return std::pair(0, LLT::integer(MaxSize));
uint64_t Align = Query.MMODescrs[0].AlignInBits;
- return std::pair(0, LLT::scalar(Align));
+ return std::pair(0, LLT::integer(Align));
})
.fewerElementsIf(
[=](const LegalityQuery &Query) -> bool {
@@ -1603,8 +1689,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// May need relegalization for the scalars.
return std::pair(0, EltTy);
})
- .minScalar(0, S32)
- .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
+ .minScalar(0, I32)
+ .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, I32))
.widenScalarToNextPow2(0)
.moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
.lower();
@@ -1612,14 +1698,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Unaligned accesses not lowered.
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
- {S32, GlobalPtr, S16, 2 * 8},
- {S32, LocalPtr, S8, 8},
- {S32, LocalPtr, S16, 16},
- {S32, PrivatePtr, S8, 8},
- {S32, PrivatePtr, S16, 16},
- {S32, ConstantPtr, S8, 8},
- {S32, ConstantPtr, S16, 2 * 8}})
+ .legalForTypesWithMemDesc({{I32, GlobalPtr, I8, 8},
+ {I32, GlobalPtr, I16, 2 * 8},
+ {I32, LocalPtr, I8, 8},
+ {I32, LocalPtr, I16, 16},
+ {I32, PrivatePtr, I8, 8},
+ {I32, PrivatePtr, I16, 16},
+ {I32, ConstantPtr, I8, 8},
+ {I32, ConstantPtr, I16, 2 * 8}})
.legalIf(
[=](const LegalityQuery &Query) -> bool {
return isLoadStoreLegal(ST, Query);
@@ -1627,7 +1713,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAddressSpace()) {
ExtLoads.legalForTypesWithMemDesc(
- {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
+ {{I32, FlatPtr, I8, 8}, {I32, FlatPtr, I16, 16}});
}
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
@@ -1637,7 +1723,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// inserting addrspacecasts.
ExtLoads.customIf(typeIs(1, Constant32Ptr));
- ExtLoads.clampScalar(0, S32, S32)
+ ExtLoads.clampScalar(0, I32, I32)
.widenScalarToNextPow2(0)
.lower();
@@ -1646,35 +1732,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
- .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
- {S64, GlobalPtr}, {S64, LocalPtr},
- {S32, RegionPtr}, {S64, RegionPtr}});
+ .legalFor({{I32, GlobalPtr}, {I32, LocalPtr},
+ {I64, GlobalPtr}, {I64, LocalPtr},
+ {I32, RegionPtr}, {I64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
- Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
+ Atomics.legalFor({{I32, FlatPtr}, {I64, FlatPtr}});
}
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
- Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
+ Atomic.legalFor({{F32, LocalPtr}, {F32, RegionPtr}});
if (ST.hasLdsAtomicAddF64())
- Atomic.legalFor({{S64, LocalPtr}});
+ Atomic.legalFor({{F64, LocalPtr}});
if (ST.hasAtomicDsPkAdd16Insts())
Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
}
if (ST.hasAtomicFaddInsts())
- Atomic.legalFor({{S32, GlobalPtr}});
+ Atomic.legalFor({{F32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
- Atomic.legalFor({{S32, FlatPtr}});
+ Atomic.legalFor({{F32, FlatPtr}});
if (ST.hasGFX90AInsts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
// TODO: Move atomic expansion into legalizer
Atomic.legalFor({
- {S32, GlobalPtr},
- {S64, GlobalPtr},
- {S64, FlatPtr}
+ {F32, GlobalPtr},
+ {F64, GlobalPtr},
+ {F64, FlatPtr}
});
}
@@ -1705,40 +1791,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
- .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
- {S32, FlatPtr}, {S64, FlatPtr}})
- .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
- {S32, RegionPtr}, {S64, RegionPtr}});
+ .customFor({{I32, GlobalPtr}, {I64, GlobalPtr},
+ {I32, FlatPtr}, {I64, FlatPtr}})
+ .legalFor({{I32, LocalPtr}, {I64, LocalPtr},
+ {I32, RegionPtr}, {I64, RegionPtr}});
// TODO: Pointer types, any 32-bit or 64-bit vector
// Condition should be s32 for scalar, s1 for vector.
getActionDefinitionsBuilder(G_SELECT)
- .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
+ .legalForCartesianProduct({I16, F16, BF16, I32, F32, I64, F64, V2I32, V2F32, V2I16, V2F16, V2BF16, V4I16, V4F16, V4BF16, GlobalPtr,
LocalPtr, FlatPtr, PrivatePtr,
LLT::fixed_vector(2, LocalPtr),
LLT::fixed_vector(2, PrivatePtr)},
- {S1, S32})
- .clampScalar(0, S16, S64)
+ {I1, I32})
+ .clampScalar(0, I16, I64)
.scalarize(1)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(numElementsNotEven(0), scalarize(0))
- .clampMaxNumElements(0, S32, 2)
+ .clampMaxNumElements(0, I32, 2)
.clampMaxNumElements(0, LocalPtr, 2)
.clampMaxNumElements(0, PrivatePtr, 2)
.scalarize(0)
.widenScalarToNextPow2(0)
- .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
+ .legalIf(all(isPointer(0), typeInSet(1, {I1, I32})));
// TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
// be more flexible with the shift amount type.
auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
- .legalFor({{S32, S32}, {S64, S32}});
+ .legalFor({{I32, I32}, {I64, I32}});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts()) {
- Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
- .clampMaxNumElements(0, S16, 2);
+ Shifts.legalFor({{I16, I16}, {V2I16, V2I16}})
+ .clampMaxNumElements(0, I16, 2);
} else
- Shifts.legalFor({{S16, S16}});
+ Shifts.legalFor({{I16, I16}});
// TODO: Support 16-bit shift amounts for all types
Shifts.widenScalarIf(
@@ -1749,26 +1835,26 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT AmountTy = Query.Types[1];
return ValTy.getSizeInBits() <= 16 &&
AmountTy.getSizeInBits() < 16;
- }, changeTo(1, S16));
- Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
- Shifts.clampScalar(1, S32, S32);
+ }, changeTo(1, I16));
+ Shifts.maxScalarIf(typeIs(0, I16), 1, I16);
+ Shifts.clampScalar(1, I32, I32);
Shifts.widenScalarToNextPow2(0, 16);
- Shifts.clampScalar(0, S16, S64);
+ Shifts.clampScalar(0, I16, I64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
- .minScalar(0, S16)
+ .minScalar(0, I16)
.scalarize(0)
.lower();
} else {
// Make sure we legalize the shift amount type first, as the general
// expansion for the shifted type will produce much worse code if it hasn't
// been truncated already.
- Shifts.clampScalar(1, S32, S32);
+ Shifts.clampScalar(1, I32, I32);
Shifts.widenScalarToNextPow2(0, 32);
- Shifts.clampScalar(0, S32, S64);
+ Shifts.clampScalar(0, I32, I64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
- .minScalar(0, S32)
+ .minScalar(0, I32)
.scalarize(0)
.lower();
}
@@ -1820,10 +1906,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
VecTypeIdx,
LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
})
- .clampScalar(EltTypeIdx, S32, S64)
- .clampScalar(VecTypeIdx, S32, S64)
- .clampScalar(IdxTypeIdx, S32, S32)
- .clampMaxNumElements(VecTypeIdx, S32, 32)
+ .clampScalar(EltTypeIdx, I32, I64)
+ .clampScalar(VecTypeIdx, I32, I64)
+ .clampScalar(IdxTypeIdx, I32, I32)
+ .clampMaxNumElements(VecTypeIdx, I32, 32)
+ .clampMaxNumElements(VecTypeIdx, F32, 32)
// TODO: Clamp elements for 64-bit vectors?
.moreElementsIf(
isIllegalRegisterType(VecTypeIdx),
@@ -1845,7 +1932,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Doesn't handle extract of illegal sizes.
getActionDefinitionsBuilder(Op)
- .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
+ .lowerIf(all(typeIs(LitTyIdx, I16), sizeIs(BigTyIdx, 32)))
.lowerIf([=](const LegalityQuery &Query) {
// Sub-vector(or single element) insert and extract.
// TODO: verify immediate offset here since lower only works with
@@ -1878,11 +1965,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V32S32)
- .clampNumElements(0, V2S64, V16S64)
- .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
+ .legalForCartesianProduct(AllS32Vectors, {I32})
+ .legalForCartesianProduct(AllS32Vectors, {F32})
+ .legalForCartesianProduct(AllS64Vectors, {I64})
+ .legalForCartesianProduct(AllS64Vectors, {F64})
+ .clampNumElements(0, V16I32, V32I32)
+ .clampNumElements(0, V2I64, V16I64)
+ .fewerElementsIf(isWideVec16(0), changeTo(0, V2I16))
.moreElementsIf(
isIllegalRegisterType(0),
moreElementsToNextExistingRegClass(0));
@@ -1890,18 +1979,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasScalarPackInsts()) {
BuildVector
// FIXME: Should probably widen s1 vectors straight to s32
- .minScalarOrElt(0, S16)
- .minScalar(1, S16);
+ .minScalarOrElt(0, I16)
+ .minScalar(1, I16);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
- .legalFor({V2S16, S32})
+ .legalFor({V2I16, I32})
.lower();
} else {
- BuildVector.customFor({V2S16, S16});
- BuildVector.minScalarOrElt(0, S32);
+ BuildVector.customFor({V2I16, I16});
+ BuildVector.minScalarOrElt(0, I32);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
- .customFor({V2S16, S32})
+ .customFor({V2I16, I32})
.lower();
}
@@ -1910,9 +1999,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalIf(all(isRegisterType(0), isRegisterType(1)))
- .clampMaxNumElements(0, S32, 32)
- .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
- .clampMaxNumElements(0, S16, 64);
+ .clampMaxNumElements(0, I32, 32)
+ .clampMaxNumElements(1, I16, 2) // TODO: Make 4?
+ .clampMaxNumElements(0, I16, 64);
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
@@ -1935,23 +2024,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &Builder = getActionDefinitionsBuilder(Op)
.legalIf(all(isRegisterType(0), isRegisterType(1)))
- .lowerFor({{S16, V2S16}})
+ .lowerFor({{I16, V2I16}})
.lowerIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
return BigTy.getSizeInBits() == 32;
})
// Try to widen to s16 first for small types.
// TODO: Only do this on targets with legal s16 shifts
- .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
+ .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, I16)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
- .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
- elementTypeIs(1, S16)),
- changeTo(1, V2S16))
+ .fewerElementsIf(all(typeIs(0, I16), vectorWiderThan(1, 32),
+ elementTypeIs(1, I16)),
+ changeTo(1, V2I16))
// Clamp the little scalar to s8-s256 and make it a power of 2. It's not
// worth considering the multiples of 64 since 2*192 and 2*384 are not
// valid.
- .clampScalar(LitTyIdx, S32, S512)
+ .clampScalar(LitTyIdx, I32, I512)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
// Break up vectors with weird elements into scalars
.fewerElementsIf(
@@ -1960,7 +2049,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
scalarize(1))
- .clampScalar(BigTyIdx, S32, MaxScalar);
+ .clampScalar(BigTyIdx, I32, MaxScalar);
if (Op == G_MERGE_VALUES) {
Builder.widenScalarIf(
@@ -1969,7 +2058,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT Ty = Query.Types[LitTyIdx];
return Ty.getSizeInBits() < 32;
},
- changeTo(LitTyIdx, S32));
+ changeTo(LitTyIdx, I32));
}
Builder.widenScalarIf(
@@ -1997,25 +2086,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
- .legalFor({{S32}, {S64}});
+ .legalFor({{I32}, {I64}});
if (ST.hasVOP3PInsts()) {
- SextInReg.lowerFor({{V2S16}})
+ SextInReg.lowerFor({{V2I16}})
// Prefer to reduce vector widths for 16-bit vectors before lowering, to
// get more vector shift opportunities, since we'll get those when
// expanded.
- .clampMaxNumElementsStrict(0, S16, 2);
+ .clampMaxNumElementsStrict(0, I16, 2);
} else if (ST.has16BitInsts()) {
- SextInReg.lowerFor({{S32}, {S64}, {S16}});
+ SextInReg.lowerFor({{I32}, {I64}, {I16}});
} else {
// Prefer to promote to s32 before lowering if we don't have 16-bit
// shifts. This avoids a lot of intermediate truncate and extend operations.
- SextInReg.lowerFor({{S32}, {S64}});
+ SextInReg.lowerFor({{I32}, {I64}});
}
SextInReg
.scalarize(0)
- .clampScalar(0, S32, S64)
+ .clampScalar(0, I32, I64)
.lower();
getActionDefinitionsBuilder({G_ROTR, G_ROTL})
@@ -2024,16 +2113,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
- .legalFor({{S32, S32}})
- .lowerFor({{V2S16, V2S16}})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({{I32, I32}})
+ .lowerFor({{V2I16, V2I16}})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
.lower();
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder(G_FSHL)
- .lowerFor({{V2S16, V2S16}})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .lowerFor({{V2I16, V2I16}})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
.lower();
} else {
@@ -2043,22 +2132,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
- .legalFor({S64});
+ .legalFor({I64});
- getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
+ getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({I64});
getActionDefinitionsBuilder(G_FENCE)
.alwaysLegal();
getActionDefinitionsBuilder({G_SMULO, G_UMULO})
.scalarize(0)
- .minScalar(0, S32)
+ .minScalar(0, I32)
.lower();
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
- .legalFor({{S32, S32}, {S64, S32}})
- .clampScalar(1, S32, S32)
- .clampScalar(0, S32, S64)
+ .legalFor({{I32, I32}, {I64, I32}})
+ .clampScalar(1, I32, I32)
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(0)
.scalarize(0);
@@ -2075,7 +2164,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasIEEEMinMax()) {
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
.legalFor(FPTypesPK16)
- .clampMaxNumElements(0, S16, 2)
+ .clampMaxNumElements(0, I16, 2)
.scalarize(0);
} else {
// TODO: Implement
@@ -2208,8 +2297,6 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
@@ -2227,10 +2314,10 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
// Register TableGen definitions would need an overhaul to get rid of the
// artificial "HI" aperture registers and prevent this kind of issue from
// happening.
- Register Dst = MRI.createGenericVirtualRegister(S64);
+ Register Dst = MRI.createGenericVirtualRegister(I64);
MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
- return B.buildUnmerge(S32, Dst).getReg(1);
+ return B.buildUnmerge(I32, Dst).getReg(1);
}
// TODO: can we be smarter about machine pointer info?
@@ -2258,13 +2345,13 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- LLT::scalar(32), commonAlignment(Align(64), Offset));
+ I32, commonAlignment(Align(64), Offset));
// Pointer address
B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildConstant(I64, Offset).getReg(0));
// Load address
- return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+ return B.buildLoad(I32, LoadAddr, *MMO).getReg(0);
}
Register QueuePtr = MRI.createGenericVirtualRegister(
@@ -2281,11 +2368,11 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- LLT::scalar(32), commonAlignment(Align(64), StructOffset));
+ I32, commonAlignment(Align(64), StructOffset));
B.buildPtrAdd(LoadAddr, QueuePtr,
- B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
- return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+ B.buildConstant(I64, StructOffset).getReg(0));
+ return B.buildLoad(I32, LoadAddr, *MMO).getReg(0);
}
/// Return true if the value is a known valid address, such that a null check is
@@ -2319,8 +2406,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
(isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
Intrinsic::amdgcn_addrspacecast_nonnull));
-
- const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
: MI.getOperand(1).getReg();
@@ -2362,7 +2447,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
auto CmpRes =
- B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
+ B.buildICmp(CmpInst::ICMP_NE, I1, Src, FlatNull.getReg(0));
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
@@ -2379,7 +2464,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// Coerce the type of the low half of the result so we can use
// merge_values.
- Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+ Register SrcAsInt = B.buildPtrToInt(I32, Src).getReg(0);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
@@ -2399,7 +2484,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
- auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
+ auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, I1, Src,
SegmentNull.getReg(0));
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
@@ -2420,8 +2505,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
DstTy.getSizeInBits() == 64) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
uint32_t AddrHiVal = Info->get32BitAddressHighBits();
- auto PtrLo = B.buildPtrToInt(S32, Src);
- auto HighAddr = B.buildConstant(S32, AddrHiVal);
+ auto PtrLo = B.buildPtrToInt(I32, Src);
+ auto HighAddr = B.buildConstant(I32, AddrHiVal);
B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
MI.eraseFromParent();
return true;
@@ -2466,25 +2551,21 @@ bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFceil(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
-
- const LLT S1 = LLT::scalar(1);
- const LLT S64 = LLT::scalar(64);
-
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src).isScalar(64));
+ assert(MRI.getType(Src).isFloat(64));
// result = trunc(src)
// if (src > 0.0 && src != result)
// result += 1.0
- auto Trunc = B.buildIntrinsicTrunc(S64, Src);
+ auto Trunc = B.buildIntrinsicTrunc(F64, Src);
- const auto Zero = B.buildFConstant(S64, 0.0);
- const auto One = B.buildFConstant(S64, 1.0);
- auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
- auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
- auto And = B.buildAnd(S1, Lt0, NeTrunc);
- auto Add = B.buildSelect(S64, And, One, Zero);
+ const auto Zero = B.buildFConstant(F64, 0.0);
+ const auto One = B.buildFConstant(F64, 1.0);
+ auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, I1, Src, Zero);
+ auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, I1, Src, Trunc);
+ auto And = B.buildAnd(I1, Lt0, NeTrunc);
+ auto Add = B.buildSelect(F64, And, One, Zero);
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
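
For reference, the sequence above computes ceil(x) as trunc(x) plus a
conditional 1.0. A standalone C++ sketch of the same arithmetic, assuming IEEE
f64 (the helper name is illustrative, not part of the patch):

    #include <cmath>

    // ceil(x) = trunc(x) + ((x > 0.0 && x != trunc(x)) ? 1.0 : 0.0)
    double ceilViaTrunc(double Src) {
      double Trunc = std::trunc(Src);          // G_INTRINSIC_TRUNC on F64
      double Add = (Src > 0.0 && Src != Trunc) // FCMP_OGT, FCMP_ONE, G_AND on I1
                       ? 1.0
                       : 0.0;                  // G_SELECT between One and Zero
      return Trunc + Add;                      // final G_FADD
    }
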
@@ -2513,31 +2594,26 @@ static MachineInstrBuilder extractF64Exponent(Register Hi,
MachineIRBuilder &B) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
- LLT S32 = LLT::scalar(32);
- auto Const0 = B.buildConstant(S32, FractBits - 32);
- auto Const1 = B.buildConstant(S32, ExpBits);
+ auto Const0 = B.buildConstant(I32, FractBits - 32);
+ auto Const1 = B.buildConstant(I32, ExpBits);
- auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
+ auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {I32})
.addUse(Hi)
.addUse(Const0.getReg(0))
.addUse(Const1.getReg(0));
- return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
+ return B.buildSub(I32, ExpPart, B.buildConstant(I32, 1023));
}
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
-
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src).isScalar(64));
+ assert(MRI.getType(Src).isFloat(64));
// TODO: Should this use extract since the low half is unused?
- auto Unmerge = B.buildUnmerge({S32, S32}, Src);
+ auto Unmerge = B.buildUnmerge({I32, I32}, Src);
Register Hi = Unmerge.getReg(1);
// Extract the upper half, since this is where we will find the sign and
@@ -2547,25 +2623,26 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
const unsigned FractBits = 52;
// Extract the sign bit.
- const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
- auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
+ const auto SignBitMask = B.buildConstant(I32, UINT32_C(1) << 31);
+ auto SignBit = B.buildAnd(I32, Hi, SignBitMask);
- const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
+ const auto FractMask = B.buildConstant(I64, (UINT64_C(1) << FractBits) - 1);
- const auto Zero32 = B.buildConstant(S32, 0);
+ const auto Zero32 = B.buildConstant(I32, 0);
// Extend back to 64-bits.
- auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
+ auto SignBit64 = B.buildMergeLikeInstr(I64, {Zero32, SignBit});
+ SignBit64 = B.buildBitcast(F64, SignBit64);
- auto Shr = B.buildAShr(S64, FractMask, Exp);
- auto Not = B.buildNot(S64, Shr);
- auto Tmp0 = B.buildAnd(S64, Src, Not);
- auto FiftyOne = B.buildConstant(S32, FractBits - 1);
+ auto Shr = B.buildAShr(I64, FractMask, Exp);
+ auto Not = B.buildNot(I64, Shr);
+ auto Tmp0 = B.buildBitcast(F64, B.buildAnd(I64, Src, Not));
+ auto FiftyOne = B.buildConstant(I32, FractBits - 1);
- auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
- auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
+ auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, I1, Exp, Zero32);
+ auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, I1, Exp, FiftyOne);
- auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
+ auto Tmp1 = B.buildSelect(F64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
MI.eraseFromParent();
return true;
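
The bit-level f64 trunc above clears the sub-integer fraction bits, keeping
only the sign when |x| < 1 and returning x unchanged once the exponent exceeds
the fraction width. A plain C++ sketch under IEEE-754 binary64 assumptions,
with early returns standing in for the two selects (illustrative name):

    #include <cstdint>
    #include <cstring>

    double truncViaBits(double Src) {
      uint64_t Bits;
      std::memcpy(&Bits, &Src, 8);
      uint32_t Hi = (uint32_t)(Bits >> 32);
      int32_t Exp = (int32_t)((Hi >> 20) & 0x7ff) - 1023;       // extractF64Exponent
      if (Exp > 51)                                             // ExpGt51: already integral (or inf/nan)
        return Src;
      uint64_t SignBit64 = (uint64_t)(Hi & 0x80000000u) << 32;  // sign bit only
      uint64_t FractMask = (UINT64_C(1) << 52) - 1;
      uint64_t Result = Exp < 0 ? SignBit64                     // ExpLt0: |x| < 1, keep just the sign
                                : (Bits & ~(FractMask >> Exp)); // clear fraction bits below 2^0
      double R;
      std::memcpy(&R, &Result, 8);
      return R;
    }
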
@@ -2578,20 +2655,17 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
+ assert(MRI.getType(Src).isInteger(64));
- assert(MRI.getType(Src).isScalar(64));
+ auto Unmerge = B.buildUnmerge({I32, I32}, Src);
+ auto ThirtyTwo = B.buildConstant(I32, 32);
- auto Unmerge = B.buildUnmerge({S32, S32}, Src);
- auto ThirtyTwo = B.buildConstant(S32, 32);
+ if (MRI.getType(Dst).isFloat(64)) {
+ auto CvtHi = Signed ? B.buildSITOFP(F64, Unmerge.getReg(1))
+ : B.buildUITOFP(F64, Unmerge.getReg(1));
- if (MRI.getType(Dst).isScalar(64)) {
- auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
- : B.buildUITOFP(S64, Unmerge.getReg(1));
-
- auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
- auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
+ auto CvtLo = B.buildUITOFP(F64, Unmerge.getReg(0));
+ auto LdExp = B.buildFLdexp(F64, CvtHi, ThirtyTwo);
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(Dst, LdExp, CvtLo);
@@ -2599,28 +2673,28 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
return true;
}
- assert(MRI.getType(Dst).isScalar(32));
+ assert(MRI.getType(Dst).isFloat(32));
- auto One = B.buildConstant(S32, 1);
+ auto One = B.buildConstant(I32, 1);
MachineInstrBuilder ShAmt;
if (Signed) {
- auto ThirtyOne = B.buildConstant(S32, 31);
- auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
- auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
- auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
- auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
+ auto ThirtyOne = B.buildConstant(I32, 31);
+ auto X = B.buildXor(I32, Unmerge.getReg(0), Unmerge.getReg(1));
+ auto OppositeSign = B.buildAShr(I32, X, ThirtyOne);
+ auto MaxShAmt = B.buildAdd(I32, ThirtyTwo, OppositeSign);
+ auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {I32})
.addUse(Unmerge.getReg(1));
- auto LS2 = B.buildSub(S32, LS, One);
- ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
+ auto LS2 = B.buildSub(I32, LS, One);
+ ShAmt = B.buildUMin(I32, LS2, MaxShAmt);
} else
- ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
- auto Norm = B.buildShl(S64, Src, ShAmt);
- auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
- auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
- auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
- auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
- auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
+ ShAmt = B.buildCTLZ(I32, Unmerge.getReg(1));
+ auto Norm = B.buildShl(I64, Src, ShAmt);
+ auto Unmerge2 = B.buildUnmerge({I32, I32}, Norm);
+ auto Adjust = B.buildUMin(I32, One, Unmerge2.getReg(0));
+ auto Norm2 = B.buildOr(I32, Unmerge2.getReg(1), Adjust);
+ auto FVal = Signed ? B.buildSITOFP(F32, Norm2) : B.buildUITOFP(F32, Norm2);
+ auto Scale = B.buildSub(I32, ThirtyTwo, ShAmt);
B.buildFLdexp(Dst, FVal, Scale);
MI.eraseFromParent();
return true;
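
The 64-bit integer-to-f64 path above converts the two 32-bit halves separately
and recombines them with ldexp. A minimal C++ mirror of the unsigned case,
assuming IEEE f64 (illustrative name):

    #include <cmath>
    #include <cstdint>

    double uitofp_i64_to_f64(uint64_t Src) {
      uint32_t Lo = (uint32_t)Src;             // Unmerge.getReg(0)
      uint32_t Hi = (uint32_t)(Src >> 32);     // Unmerge.getReg(1)
      double CvtHi = (double)Hi;               // G_UITOFP on the high half
      double CvtLo = (double)Lo;               // G_UITOFP on the low half
      return std::ldexp(CvtHi, 32) + CvtLo;    // G_FLDEXP by 32, then G_FADD
    }
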
@@ -2636,11 +2710,8 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
-
const LLT SrcLT = MRI.getType(Src);
- assert((SrcLT.isScalar(32) || SrcLT.isScalar(64)) && MRI.getType(Dst).isScalar(64));
+ assert((SrcLT.isFloat(32) || SrcLT.isFloat(64)) && MRI.getType(Dst).isInteger(64));
unsigned Flags = MI.getFlags();
@@ -2661,35 +2732,35 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
// negative. To avoid the loss of precision, We need to take the absolute
// value after truncating and flip the result back based on the original
// signedness.
- Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
- Trunc = B.buildFAbs(S32, Trunc, Flags);
+ Sign = B.buildAShr(I32, Src, B.buildConstant(I32, 31));
+ Trunc = B.buildFAbs(F32, Trunc, Flags);
}
MachineInstrBuilder K0, K1;
- if (SrcLT.isScalar(64)) {
+ if (SrcLT.isFloat(64)) {
K0 = B.buildFConstant(
- S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
+ F64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
K1 = B.buildFConstant(
- S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
+ F64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
} else {
K0 = B.buildFConstant(
- S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
+ F32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
K1 = B.buildFConstant(
- S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
+ F32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
}
auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
- auto Hi = (Signed && SrcLT.isScalar(64)) ? B.buildFPTOSI(S32, FloorMul)
- : B.buildFPTOUI(S32, FloorMul);
- auto Lo = B.buildFPTOUI(S32, Fma);
+ auto Hi = (Signed && SrcLT.isFloat(64)) ? B.buildFPTOSI(I32, FloorMul)
+ : B.buildFPTOUI(I32, FloorMul);
+ auto Lo = B.buildFPTOUI(I32, Fma);
- if (Signed && SrcLT.isScalar(32)) {
+ if (Signed && SrcLT.isFloat(32)) {
// Flip the result based on the signedness, which is either all 0s or 1s.
- Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
+ Sign = B.buildMergeLikeInstr(I64, {Sign, Sign});
// r := xor({lo, hi}, sign) - sign;
- B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
+ B.buildSub(Dst, B.buildXor(I64, B.buildMergeLikeInstr(I64, {Lo, Hi}), Sign),
Sign);
} else
B.buildMergeLikeInstr(Dst, {Lo, Hi});
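
The fp-to-64-bit-integer expansion above splits the value into 32-bit halves by
scaling with 2^-32, flooring, and recovering the low part with an FMA. A
standalone C++ sketch of the unsigned f64 path (illustrative name; in-range
inputs assumed):

    #include <cmath>
    #include <cstdint>

    uint64_t fptoui_f64_to_u64(double Src) {
      double Trunc = std::trunc(Src);
      const double K0 = 0x1p-32;                  // 2^-32  (0x3df0000000000000)
      const double K1 = -0x1p+32;                 // -2^32  (0xc1f0000000000000)
      double Mul = Trunc * K0;
      double FloorMul = std::floor(Mul);          // high 32 bits as a float
      double Fma = std::fma(FloorMul, K1, Trunc); // low part = Trunc - FloorMul * 2^32
      uint32_t Hi = (uint32_t)FloorMul;
      uint32_t Lo = (uint32_t)Fma;
      return ((uint64_t)Hi << 32) | Lo;           // G_MERGE_VALUES {Lo, Hi}
    }
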
@@ -2737,7 +2808,7 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// vector of integers using ptrtoint (and inttoptr on the output) in order to
// drive the legalization forward.
if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
- LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
+ LLT IntTy = LLT::integer(EltTy.getSizeInBits());
LLT IntVecTy = VecTy.changeElementType(IntTy);
auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
@@ -2790,7 +2861,7 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// new value, and then inttoptr the result vector back. This will then allow
// the rest of legalization to take over.
if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
- LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
+ LLT IntTy = LLT::integer(EltTy.getSizeInBits());
LLT IntVecTy = VecTy.changeElementType(IntTy);
auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
@@ -2916,13 +2987,11 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
MachineRegisterInfo &MRI) const {
bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
- LLT S32 = LLT::scalar(32);
-
// Use the destination directly, if and only if we store the lower address
// part only and we don't have a register class being set.
Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
? DstReg
- : MRI.createGenericVirtualRegister(S32);
+ : MRI.createGenericVirtualRegister(I32);
if (!MRI.getRegClassOrNull(AddrLo))
MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
@@ -2937,7 +3006,7 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
assert(PtrTy.getSizeInBits() == 64 &&
"Must provide a 64-bit pointer type!");
- Register AddrHi = MRI.createGenericVirtualRegister(S32);
+ Register AddrHi = MRI.createGenericVirtualRegister(I32);
MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_MOV_B32)
@@ -2948,7 +3017,7 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
// class being set.
Register AddrDst = !MRI.getRegClassOrNull(DstReg)
? DstReg
- : MRI.createGenericVirtualRegister(LLT::scalar(64));
+ : MRI.createGenericVirtualRegister(I64);
if (!MRI.getRegClassOrNull(AddrDst))
MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
@@ -3017,8 +3086,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
// Adjust alignment for that dynamic shared memory array.
MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
- LLT S32 = LLT::scalar(32);
- auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
+ auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {I32});
B.buildIntToPtr(DstReg, Sz);
MI.eraseFromParent();
return true;
@@ -3078,7 +3146,7 @@ static LLT widenToNextPowerOf2(LLT Ty) {
if (Ty.isVector())
return Ty.changeElementCount(
ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
- return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
+ return LLT::integer(PowerOf2Ceil(Ty.getSizeInBits()));
}
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
@@ -3258,7 +3326,7 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
break;
}
case TargetOpcode::G_FPEXT: {
- return MRI.getType(DefMI->getOperand(1).getReg()).isScalar(16);
+ return MRI.getType(DefMI->getOperand(1).getReg()).isFloat(16);
}
default:
return false;
@@ -3287,11 +3355,10 @@ AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
return {};
- const LLT F32 = LLT::scalar(32);
auto SmallestNormal = B.buildFConstant(
F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
auto IsLtSmallestNormal =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, Src, SmallestNormal);
auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
auto One = B.buildFConstant(F32, 1.0);
@@ -3315,8 +3382,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();
- if (Ty.isScalar(16)) {
- const LLT F32 = LLT::scalar(32);
+ if (Ty.isFloat(16)) {
// Nothing in half is a denormal when promoted to f32.
auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
@@ -3327,7 +3393,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
return true;
}
- assert(Ty.isScalar(32));
+ assert(Ty.isFloat(32));
auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
if (!ScaledInput) {
@@ -3370,9 +3436,6 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
const LLT Ty = MRI.getType(X);
MachineFunction &MF = B.getMF();
- const LLT F32 = LLT::scalar(32);
- const LLT F16 = LLT::scalar(16);
-
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
@@ -3448,7 +3511,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
auto Fabs = B.buildFAbs(Ty, Y);
auto IsFinite =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, Fabs, Inf, Flags);
R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
}
@@ -3474,7 +3537,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
LLT Ty = B.getMRI()->getType(Dst);
- if (Ty.isScalar(32)) {
+ if (Ty.isFloat(32)) {
auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
if (ScaledInput) {
auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
@@ -3497,7 +3560,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
}
}
- auto Log2Operand = Ty.isScalar(16)
+ auto Log2Operand = Ty.isFloat(16)
? B.buildFLog2(Ty, Src, Flags)
: B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
.addUse(Src)
@@ -3516,8 +3579,6 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
Register Src = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
- const LLT F16 = LLT::scalar(16);
- const LLT F32 = LLT::scalar(32);
if (Ty == F16) {
// Nothing in half is a denormal when promoted to f32.
@@ -3568,7 +3629,6 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
Register X, unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);
- LLT F32 = LLT::scalar(32);
if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
auto Log2E = B.buildFConstant(Ty, numbers::log2e);
@@ -3613,8 +3673,6 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
MachineFunction &MF = B.getMF();
MachineRegisterInfo &MRI = *B.getMRI();
LLT Ty = MRI.getType(Dst);
- const LLT F16 = LLT::scalar(16);
- const LLT F32 = LLT::scalar(32);
const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
if (Ty == F16) {
@@ -3715,7 +3773,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
// It is unsafe to contract this fsub into the PH multiply.
auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
- auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
+ auto IntE = B.buildFPTOSI(I32, E);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
.addUse(A.getReg(0))
@@ -3726,7 +3784,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
auto Underflow =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, X, UnderflowCheckConst);
R = B.buildSelect(Ty, Underflow, Zero, R);
@@ -3737,7 +3795,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
auto Overflow =
- B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
+ B.buildFCmp(CmpInst::FCMP_OGT, I1, X, OverflowCheckConst);
auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
}
@@ -3795,7 +3853,6 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S1 = LLT::scalar(1);
Register Dst = MI.getOperand(0).getReg();
Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
@@ -3836,7 +3893,7 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
Register CorrectedFract = Min;
if (!MI.getFlag(MachineInstr::FmNoNans)) {
- auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
+ auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, I1, ModSrc, ModSrc, Flags);
CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
}
@@ -3852,8 +3909,6 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeBuildVector(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
- const LLT S32 = LLT::scalar(32);
- const LLT S16 = LLT::scalar(16);
assert(MRI.getType(Dst).isFixedVector(2, 16));
Register Src0 = MI.getOperand(1).getReg();
@@ -3861,11 +3916,11 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
assert(MRI.getType(Src0).isScalar(32));
- Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
- Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
+ Src0 = B.buildTrunc(I16, MI.getOperand(1).getReg()).getReg(0);
+ Src1 = B.buildTrunc(I16, MI.getOperand(2).getReg()).getReg(0);
}
- auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
+ auto Merge = B.buildMergeLikeInstr(I32, {Src0, Src1});
B.buildBitcast(Dst, Merge);
MI.eraseFromParent();
@@ -3894,21 +3949,17 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
MachineIRBuilder &B = Helper.MIRBuilder;
GISelKnownBits &KB = *Helper.getKnownBits();
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
-
Register Zero32;
Register Zero64;
auto getZero32 = [&]() -> Register {
if (!Zero32)
- Zero32 = B.buildConstant(S32, 0).getReg(0);
+ Zero32 = B.buildConstant(I32, 0).getReg(0);
return Zero32;
};
auto getZero64 = [&]() -> Register {
if (!Zero64)
- Zero64 = B.buildConstant(S64, 0).getReg(0);
+ Zero64 = B.buildConstant(I64, 0).getReg(0);
return Zero64;
};
@@ -3931,16 +3982,16 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
Register CarryAccum;
if (CarryIn.size() == 1) {
if (!LocalAccum) {
- LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+ LocalAccum = B.buildZExt(I32, CarryIn[0]).getReg(0);
return Register();
}
CarryAccum = getZero32();
} else {
- CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+ CarryAccum = B.buildZExt(I32, CarryIn[0]).getReg(0);
for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
CarryAccum =
- B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
+ B.buildUAdde(I32, I1, CarryAccum, getZero32(), CarryIn[i])
.getReg(0);
}
@@ -3951,7 +4002,7 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
}
auto Add =
- B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
+ B.buildUAdde(I32, I1, CarryAccum, LocalAccum, CarryIn.back());
LocalAccum = Add.getReg(0);
return HaveCarryOut ? Add.getReg(1) : Register();
};
@@ -3986,15 +4037,15 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
++j0;
continue;
}
- auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
+ auto Mul = B.buildMul(I32, Src0[j0], Src1[j1]);
if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
LocalAccum[0] = Mul.getReg(0);
} else {
if (CarryIn.empty()) {
- LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
+ LocalAccum[0] = B.buildAdd(I32, LocalAccum[0], Mul).getReg(0);
} else {
LocalAccum[0] =
- B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
+ B.buildUAdde(I32, I1, LocalAccum[0], Mul, CarryIn.back())
.getReg(0);
CarryIn.pop_back();
}
@@ -4010,13 +4061,13 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
if (LocalAccum[0]) {
if (LocalAccum.size() == 1) {
- Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
+ Tmp = B.buildAnyExt(I64, LocalAccum[0]).getReg(0);
HaveSmallAccum = true;
} else if (LocalAccum[1]) {
- Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
+ Tmp = B.buildMergeLikeInstr(I64, LocalAccum).getReg(0);
HaveSmallAccum = false;
} else {
- Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
+ Tmp = B.buildZExt(I64, LocalAccum[0]).getReg(0);
HaveSmallAccum = true;
}
} else {
@@ -4031,7 +4082,7 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
++j0;
continue;
}
- auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
+ auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {I64, I1},
{Src0[j0], Src1[j1], Tmp});
Tmp = Mad.getReg(0);
if (!HaveSmallAccum)
@@ -4041,7 +4092,7 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
++j0;
} while (j0 <= DstIndex);
- auto Unmerge = B.buildUnmerge(S32, Tmp);
+ auto Unmerge = B.buildUnmerge(I32, Tmp);
LocalAccum[0] = Unmerge.getReg(0);
if (LocalAccum.size() > 1)
LocalAccum[1] = Unmerge.getReg(1);
@@ -4099,17 +4150,17 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
if (i == 1) {
if (!IsHighest)
- Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
+ Lo = B.buildUAddo(I32, I1, Accum[2 * i - 1], SeparateOddOut[0]);
else
- Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
+ Lo = B.buildAdd(I32, Accum[2 * i - 1], SeparateOddOut[0]);
} else {
- Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
+ Lo = B.buildUAdde(I32, I1, Accum[2 * i - 1], SeparateOddOut[0],
SeparateOddCarry);
}
Accum[2 * i - 1] = Lo->getOperand(0).getReg();
if (!IsHighest) {
- auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
+ auto Hi = B.buildUAdde(I32, I1, Accum[2 * i], SeparateOddOut[1],
Lo->getOperand(1).getReg());
Accum[2 * i] = Hi.getReg(0);
SeparateOddCarry = Hi.getReg(1);
@@ -4147,7 +4198,7 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
Register Src1 = MI.getOperand(2).getReg();
LLT Ty = MRI.getType(DstReg);
- assert(Ty.isScalar());
+ assert(Ty.isInteger());
unsigned Size = Ty.getSizeInBits();
unsigned NumParts = Size / 32;
@@ -4164,11 +4215,10 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
// in an even-aligned VGPR.
const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
- LLT S32 = LLT::scalar(32);
SmallVector<Register, 2> Src0Parts, Src1Parts;
for (unsigned i = 0; i < NumParts; ++i) {
- Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
- Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
+ Src0Parts.push_back(MRI.createGenericVirtualRegister(I32));
+ Src1Parts.push_back(MRI.createGenericVirtualRegister(I32));
}
B.buildUnmerge(Src0Parts, Src0);
B.buildUnmerge(Src1Parts, Src1);
@@ -4213,10 +4263,10 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
assert(NumBits < 32u);
- auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
- auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
- auto Shift = B.buildShl(S32, Extend, ShiftAmt);
- auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
+ auto ShiftAmt = B.buildConstant(I32, 32u - NumBits);
+ auto Extend = B.buildAnyExt(I32, {Src}).getReg(0u);
+ auto Shift = B.buildShl(I32, Extend, ShiftAmt);
+ auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {I32}, {Shift});
B.buildTrunc(Dst, Ctlz);
MI.eraseFromParent();
return true;
@@ -4285,7 +4335,6 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
*ArgRC, B.getDebugLoc(), ArgTy);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
- const LLT S32 = LLT::scalar(32);
const unsigned Mask = Arg->getMask();
const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
@@ -4294,11 +4343,11 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
// TODO: Avoid clearing the high bits if we know workitem id y/z are always
// 0.
if (Shift != 0) {
- auto ShiftAmt = B.buildConstant(S32, Shift);
- AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
+ auto ShiftAmt = B.buildConstant(I32, Shift);
+ AndMaskSrc = B.buildLShr(I32, LiveIn, ShiftAmt).getReg(0);
}
- B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
+ B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(I32, Mask >> Shift));
} else {
B.buildCopy(DstReg, LiveIn);
}
@@ -4331,17 +4380,17 @@ bool AMDGPULegalizerInfo::loadInputValue(
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Arg = &WorkGroupIDX;
ArgRC = &AMDGPU::SReg_32RegClass;
- ArgTy = LLT::scalar(32);
+ ArgTy = I32;
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
Arg = &WorkGroupIDY;
ArgRC = &AMDGPU::SReg_32RegClass;
- ArgTy = LLT::scalar(32);
+ ArgTy = I32;
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
Arg = &WorkGroupIDZ;
ArgRC = &AMDGPU::SReg_32RegClass;
- ArgTy = LLT::scalar(32);
+ ArgTy = I32;
break;
default:
break;
@@ -4469,11 +4518,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
- if (DstTy.isScalar(16))
+ if (DstTy.isFloat(16))
return legalizeFDIV16(MI, MRI, B);
- if (DstTy.isScalar(32))
+ if (DstTy.isFloat(32))
return legalizeFDIV32(MI, MRI, B);
- if (DstTy.isScalar(64))
+ if (DstTy.isFloat(64))
return legalizeFDIV64(MI, MRI, B);
return false;
@@ -4484,42 +4533,39 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
Register DstRemReg,
Register X,
Register Y) const {
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
-
// See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
// algorithm used here.
// Initial estimate of inv(y).
- auto FloatY = B.buildUITOFP(S32, Y);
- auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
- auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
- auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
- auto Z = B.buildFPTOUI(S32, ScaledY);
+ auto FloatY = B.buildUITOFP(F32, Y);
+ auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {F32}, {FloatY});
+ auto Scale = B.buildFConstant(F32, llvm::bit_cast<float>(0x4f7ffffe));
+ auto ScaledY = B.buildFMul(F32, RcpIFlag, Scale);
+ auto Z = B.buildFPTOUI(I32, ScaledY);
// One round of UNR.
- auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
- auto NegYZ = B.buildMul(S32, NegY, Z);
- Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
+ auto NegY = B.buildSub(I32, B.buildConstant(I32, 0), Y);
+ auto NegYZ = B.buildMul(I32, NegY, Z);
+ Z = B.buildAdd(I32, Z, B.buildUMulH(I32, Z, NegYZ));
// Quotient/remainder estimate.
- auto Q = B.buildUMulH(S32, X, Z);
- auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
+ auto Q = B.buildUMulH(I32, X, Z);
+ auto R = B.buildSub(I32, X, B.buildMul(I32, Q, Y));
// First quotient/remainder refinement.
- auto One = B.buildConstant(S32, 1);
- auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ auto One = B.buildConstant(I32, 1);
+ auto Cond = B.buildICmp(CmpInst::ICMP_UGE, I1, R, Y);
if (DstDivReg)
- Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
- R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
+ Q = B.buildSelect(I32, Cond, B.buildAdd(I32, Q, One), Q);
+ R = B.buildSelect(I32, Cond, B.buildSub(I32, R, Y), R);
// Second quotient/remainder refinement.
- Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ Cond = B.buildICmp(CmpInst::ICMP_UGE, I1, R, Y);
if (DstDivReg)
- B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
+ B.buildSelect(DstDivReg, Cond, B.buildAdd(I32, Q, One), Q);
if (DstRemReg)
- B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
+ B.buildSelect(DstRemReg, Cond, B.buildSub(I32, R, Y), R);
}
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
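
Stepping back to the 32-bit expansion just above: it is the usual
float-reciprocal estimate followed by one Newton-Raphson round and two
correction steps (see AMDGPUCodeGenPrepare::expandDivRem32). A host-side C++
sketch, assuming Y != 0 and using 1.0f / y in place of G_AMDGPU_RCP_IFLAG
(helper name illustrative):

    #include <cstdint>
    #include <cstring>

    uint32_t udivrem32(uint32_t X, uint32_t Y, uint32_t &Rem) {
      // Initial estimate of 2^32 / Y from a float reciprocal.
      uint32_t ScaleBits = 0x4f7ffffe;                    // float just below 2^32
      float Scale;
      std::memcpy(&Scale, &ScaleBits, sizeof(Scale));
      uint32_t Z = (uint32_t)((1.0f / (float)Y) * Scale);
      // One round of Newton-Raphson refinement.
      uint32_t NegYZ = (0u - Y) * Z;
      Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);       // Z + umulh(Z, NegYZ)
      // Quotient/remainder estimate plus two correction rounds.
      uint32_t Q = (uint32_t)(((uint64_t)X * Z) >> 32);   // umulh(X, Z)
      uint32_t R = X - Q * Y;
      if (R >= Y) { Q += 1; R -= Y; }
      if (R >= Y) { Q += 1; R -= Y; }
      Rem = R;
      return Q;
    }
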
@@ -4537,32 +4583,31 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
Register Val) {
- const LLT S32 = LLT::scalar(32);
- auto Unmerge = B.buildUnmerge(S32, Val);
+ auto Unmerge = B.buildUnmerge(I32, Val);
- auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
- auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
+ auto CvtLo = B.buildUITOFP(F32, Unmerge.getReg(0));
+ auto CvtHi = B.buildUITOFP(F32, Unmerge.getReg(1));
auto Mad = B.buildFMAD(
- S32, CvtHi, // 2**32
- B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
+ F32, CvtHi, // 2**32
+ B.buildFConstant(F32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
- auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
+ auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {F32}, {Mad});
auto Mul1 = B.buildFMul(
- S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
+ F32, Rcp, B.buildFConstant(F32, llvm::bit_cast<float>(0x5f7ffffc)));
// 2**(-32)
auto Mul2 = B.buildFMul(
- S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
- auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
+ F32, Mul1, B.buildFConstant(F32, llvm::bit_cast<float>(0x2f800000)));
+ auto Trunc = B.buildIntrinsicTrunc(F32, Mul2);
// -(2**32)
auto Mad2 = B.buildFMAD(
- S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
+ F32, Trunc, B.buildFConstant(F32, llvm::bit_cast<float>(0xcf800000)),
Mul1);
- auto ResultLo = B.buildFPTOUI(S32, Mad2);
- auto ResultHi = B.buildFPTOUI(S32, Trunc);
+ auto ResultLo = B.buildFPTOUI(I32, Mad2);
+ auto ResultHi = B.buildFPTOUI(I32, Trunc);
return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
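
emitReciprocalU64 above builds an approximate 2^64 / Val entirely in f32 and
splits it back into two 32-bit words. A literal host-side transcription of the
same sequence (1.0f / x stands in for G_AMDGPU_RCP_IFLAG, fmad is approximated
by fma, Val != 0 assumed, and fromBits is an illustrative helper):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    static float fromBits(uint32_t B) {
      float F;
      std::memcpy(&F, &B, sizeof(F));
      return F;
    }

    void reciprocalU64(uint64_t Val, uint32_t &ResLo, uint32_t &ResHi) {
      float CvtLo = (float)(uint32_t)Val;
      float CvtHi = (float)(uint32_t)(Val >> 32);
      float Mad  = std::fma(CvtHi, fromBits(0x4f800000), CvtLo); // hi * 2^32 + lo
      float Rcp  = 1.0f / Mad;
      float Mul1 = Rcp * fromBits(0x5f7ffffc);                   // scale toward 2^64 / Val
      float Mul2 = Mul1 * fromBits(0x2f800000);                  // * 2^-32
      float Trunc = std::trunc(Mul2);
      float Mad2 = std::fma(Trunc, fromBits(0xcf800000), Mul1);  // Mul1 - Trunc * 2^32
      ResLo = (uint32_t)Mad2;
      ResHi = (uint32_t)Trunc;
    }
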
@@ -4572,109 +4617,106 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
Register DstRemReg,
Register Numer,
Register Denom) const {
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
- const LLT S1 = LLT::scalar(1);
Register RcpLo, RcpHi;
std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
- auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
+ auto Rcp = B.buildMergeLikeInstr(I64, {RcpLo, RcpHi});
- auto Zero64 = B.buildConstant(S64, 0);
- auto NegDenom = B.buildSub(S64, Zero64, Denom);
+ auto Zero64 = B.buildConstant(I64, 0);
+ auto NegDenom = B.buildSub(I64, Zero64, Denom);
- auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
- auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
+ auto MulLo1 = B.buildMul(I64, NegDenom, Rcp);
+ auto MulHi1 = B.buildUMulH(I64, Rcp, MulLo1);
- auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
+ auto UnmergeMulHi1 = B.buildUnmerge(I32, MulHi1);
Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
- auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
- auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
- auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
+ auto Add1_Lo = B.buildUAddo(I32, I1, RcpLo, MulHi1_Lo);
+ auto Add1_Hi = B.buildUAdde(I32, I1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
+ auto Add1 = B.buildMergeLikeInstr(I64, {Add1_Lo, Add1_Hi});
- auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
- auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
- auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
+ auto MulLo2 = B.buildMul(I64, NegDenom, Add1);
+ auto MulHi2 = B.buildUMulH(I64, Add1, MulLo2);
+ auto UnmergeMulHi2 = B.buildUnmerge(I32, MulHi2);
Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
- auto Zero32 = B.buildConstant(S32, 0);
- auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
- auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
- auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
+ auto Zero32 = B.buildConstant(I32, 0);
+ auto Add2_Lo = B.buildUAddo(I32, I1, Add1_Lo, MulHi2_Lo);
+ auto Add2_Hi = B.buildUAdde(I32, I1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
+ auto Add2 = B.buildMergeLikeInstr(I64, {Add2_Lo, Add2_Hi});
- auto UnmergeNumer = B.buildUnmerge(S32, Numer);
+ auto UnmergeNumer = B.buildUnmerge(I32, Numer);
Register NumerLo = UnmergeNumer.getReg(0);
Register NumerHi = UnmergeNumer.getReg(1);
- auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
- auto Mul3 = B.buildMul(S64, Denom, MulHi3);
- auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
+ auto MulHi3 = B.buildUMulH(I64, Numer, Add2);
+ auto Mul3 = B.buildMul(I64, Denom, MulHi3);
+ auto UnmergeMul3 = B.buildUnmerge(I32, Mul3);
Register Mul3_Lo = UnmergeMul3.getReg(0);
Register Mul3_Hi = UnmergeMul3.getReg(1);
- auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
- auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
- auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
- auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
+ auto Sub1_Lo = B.buildUSubo(I32, I1, NumerLo, Mul3_Lo);
+ auto Sub1_Hi = B.buildUSube(I32, I1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
+ auto Sub1_Mi = B.buildSub(I32, NumerHi, Mul3_Hi);
+ auto Sub1 = B.buildMergeLikeInstr(I64, {Sub1_Lo, Sub1_Hi});
- auto UnmergeDenom = B.buildUnmerge(S32, Denom);
+ auto UnmergeDenom = B.buildUnmerge(I32, Denom);
Register DenomLo = UnmergeDenom.getReg(0);
Register DenomHi = UnmergeDenom.getReg(1);
- auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
- auto C1 = B.buildSExt(S32, CmpHi);
+ auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, I1, Sub1_Hi, DenomHi);
+ auto C1 = B.buildSExt(I32, CmpHi);
- auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
- auto C2 = B.buildSExt(S32, CmpLo);
+ auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, I1, Sub1_Lo, DenomLo);
+ auto C2 = B.buildSExt(I32, CmpLo);
- auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
- auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
+ auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, I1, Sub1_Hi, DenomHi);
+ auto C3 = B.buildSelect(I32, CmpEq, C2, C1);
// TODO: Here and below portions of the code can be enclosed into if/endif.
// Currently control flow is unconditional and we have 4 selects after
// potential endif to substitute PHIs.
// if C3 != 0 ...
- auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
- auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
- auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
- auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
+ auto Sub2_Lo = B.buildUSubo(I32, I1, Sub1_Lo, DenomLo);
+ auto Sub2_Mi = B.buildUSube(I32, I1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
+ auto Sub2_Hi = B.buildUSube(I32, I1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
+ auto Sub2 = B.buildMergeLikeInstr(I64, {Sub2_Lo, Sub2_Hi});
- auto One64 = B.buildConstant(S64, 1);
- auto Add3 = B.buildAdd(S64, MulHi3, One64);
+ auto One64 = B.buildConstant(I64, 1);
+ auto Add3 = B.buildAdd(I64, MulHi3, One64);
auto C4 =
- B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
+ B.buildSExt(I32, B.buildICmp(CmpInst::ICMP_UGE, I1, Sub2_Hi, DenomHi));
auto C5 =
- B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
+ B.buildSExt(I32, B.buildICmp(CmpInst::ICMP_UGE, I1, Sub2_Lo, DenomLo));
auto C6 = B.buildSelect(
- S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
+ I32, B.buildICmp(CmpInst::ICMP_EQ, I1, Sub2_Hi, DenomHi), C5, C4);
// if (C6 != 0)
- auto Add4 = B.buildAdd(S64, Add3, One64);
- auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
+ auto Add4 = B.buildAdd(I64, Add3, One64);
+ auto Sub3_Lo = B.buildUSubo(I32, I1, Sub2_Lo, DenomLo);
- auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
- auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
- auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
+ auto Sub3_Mi = B.buildUSube(I32, I1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
+ auto Sub3_Hi = B.buildUSube(I32, I1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
+ auto Sub3 = B.buildMergeLikeInstr(I64, {Sub3_Lo, Sub3_Hi});
// endif C6
// endif C3
if (DstDivReg) {
auto Sel1 = B.buildSelect(
- S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
- B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ I64, B.buildICmp(CmpInst::ICMP_NE, I1, C6, Zero32), Add4, Add3);
+ B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, I1, C3, Zero32),
Sel1, MulHi3);
}
if (DstRemReg) {
auto Sel2 = B.buildSelect(
- S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
- B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ I64, B.buildICmp(CmpInst::ICMP_NE, I1, C6, Zero32), Sub3, Sub2);
+ B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, I1, C3, Zero32),
Sel2, Sub1);
}
}
@@ -4706,9 +4748,9 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty.isScalar(32))
+ if (Ty.isInteger(32))
legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
- else if (Ty.isScalar(64))
+ else if (Ty.isInteger(64))
legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
else
return false;
@@ -4720,17 +4762,15 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S32 = LLT::scalar(32);
-
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (!Ty.isScalar(32) && !Ty.isScalar(64))
+ if (!Ty.isInteger(32) && !Ty.isInteger(64))
return false;
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
- auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
+ auto SignBitOffset = B.buildConstant(I32, Ty.getSizeInBits() - 1);
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
@@ -4763,7 +4803,7 @@ bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
}
}
- if (Ty.isScalar(32))
+ if (Ty.isInteger(32))
legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
else
legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
@@ -4895,9 +4935,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
- LLT S16 = LLT::scalar(16);
- LLT S32 = LLT::scalar(32);
-
// a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
// b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
// r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
@@ -4911,27 +4948,28 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
// q16.u = opx(V_CVT_F16_F32, q32.u);
// q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
- auto LHSExt = B.buildFPExt(S32, LHS, Flags);
- auto RHSExt = B.buildFPExt(S32, RHS, Flags);
- auto NegRHSExt = B.buildFNeg(S32, RHSExt);
- auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto LHSExt = B.buildFPExt(F32, LHS, Flags);
+ auto RHSExt = B.buildFPExt(F32, RHS, Flags);
+ auto NegRHSExt = B.buildFNeg(F32, RHSExt);
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F32})
.addUse(RHSExt.getReg(0))
.setMIFlags(Flags);
- auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
+ auto Quot = B.buildFMul(F32, LHSExt, Rcp, Flags);
MachineInstrBuilder Err;
if (ST.hasMadMacF32Insts()) {
- Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
- Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
- Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+ Err = B.buildFMAD(F32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMAD(F32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMAD(F32, NegRHSExt, Quot, LHSExt, Flags);
} else {
- Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
- Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
- Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
- }
- auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
- Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
- Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
- auto RDst = B.buildFPTrunc(S16, Quot, Flags);
+ Err = B.buildFMA(F32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMA(F32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMA(F32, NegRHSExt, Quot, LHSExt, Flags);
+ }
+ auto Tmp = B.buildFMul(F32, Err, Rcp, Flags);
+ Tmp = B.buildAnd(I32, Tmp, B.buildConstant(I32, 0xff800000));
+ Tmp = B.buildBitcast(F32, Tmp);
+ Quot = B.buildFAdd(F32, Tmp, Quot, Flags);
+ auto RDst = B.buildFPTrunc(F16, Quot, Flags);
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
.addUse(RDst.getReg(0))
.addUse(RHS)
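
The f16 division above is carried out in f32: a reciprocal estimate, two FMA
refinement steps, and a final error term whose mantissa is masked off (hence
the new explicit I32/F32 bitcast) before the add and the trunc back to f16. A
host-side sketch of the core, assuming IEEE f32, with 1.0f / x in place of the
rcp intrinsic and div_fixup omitted (illustrative name):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    float fdiv16_core(float LHSExt, float RHSExt) {
      float NegRHSExt = -RHSExt;
      float Rcp = 1.0f / RHSExt;                      // amdgcn_rcp stand-in
      float Quot = LHSExt * Rcp;
      float Err = std::fma(NegRHSExt, Quot, LHSExt);  // residual of the estimate
      Quot = std::fma(Err, Rcp, Quot);                // refine once
      Err = std::fma(NegRHSExt, Quot, LHSExt);        // residual again
      float Tmp = Err * Rcp;
      uint32_t Bits;
      std::memcpy(&Bits, &Tmp, sizeof(Bits));
      Bits &= 0xff800000u;                            // keep sign and exponent only
      std::memcpy(&Tmp, &Bits, sizeof(Tmp));
      return Tmp + Quot;                              // then FPTrunc to f16 + div_fixup
    }
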
@@ -4983,28 +5021,25 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
- LLT S32 = LLT::scalar(32);
- LLT S1 = LLT::scalar(1);
-
- auto One = B.buildFConstant(S32, 1.0f);
+ auto One = B.buildFConstant(F32, 1.0f);
auto DenominatorScaled =
- B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
+ B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F32, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(0)
.setMIFlags(Flags);
auto NumeratorScaled =
- B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
+ B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F32, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(1)
.setMIFlags(Flags);
- auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F32})
.addUse(DenominatorScaled.getReg(0))
.setMIFlags(Flags);
- auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
+ auto NegDivScale0 = B.buildFNeg(F32, DenominatorScaled, Flags);
const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
const bool HasDynamicDenormals =
@@ -5022,12 +5057,12 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
toggleSPDenormMode(true, B, ST, Mode);
}
- auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
- auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
- auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
- auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
- auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
- auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
+ auto Fma0 = B.buildFMA(F32, NegDivScale0, ApproxRcp, One, Flags);
+ auto Fma1 = B.buildFMA(F32, Fma0, ApproxRcp, ApproxRcp, Flags);
+ auto Mul = B.buildFMul(F32, NumeratorScaled, Fma1, Flags);
+ auto Fma2 = B.buildFMA(F32, NegDivScale0, Mul, NumeratorScaled, Flags);
+ auto Fma3 = B.buildFMA(F32, Fma2, Fma1, Mul, Flags);
+ auto Fma4 = B.buildFMA(F32, NegDivScale0, Fma3, NumeratorScaled, Flags);
if (!PreservesDenormals) {
if (HasDynamicDenormals) {
@@ -5039,7 +5074,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
toggleSPDenormMode(false, B, ST, Mode);
}
- auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
+ auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {F32})
.addUse(Fma4.getReg(0))
.addUse(Fma1.getReg(0))
.addUse(Fma3.getReg(0))
@@ -5068,59 +5103,54 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
- LLT S64 = LLT::scalar(64);
- LLT S1 = LLT::scalar(1);
-
- auto One = B.buildFConstant(S64, 1.0);
+ auto One = B.buildFConstant(F64, 1.0);
- auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
+ auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F64, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(0)
.setMIFlags(Flags);
- auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
+ auto NegDivScale0 = B.buildFNeg(F64, DivScale0.getReg(0), Flags);
- auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F64})
.addUse(DivScale0.getReg(0))
.setMIFlags(Flags);
- auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
- auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
- auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
+ auto Fma0 = B.buildFMA(F64, NegDivScale0, Rcp, One, Flags);
+ auto Fma1 = B.buildFMA(F64, Rcp, Fma0, Rcp, Flags);
+ auto Fma2 = B.buildFMA(F64, NegDivScale0, Fma1, One, Flags);
- auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
+ auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F64, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(1)
.setMIFlags(Flags);
- auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
- auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
- auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
+ auto Fma3 = B.buildFMA(F64, Fma1, Fma2, Fma1, Flags);
+ auto Mul = B.buildFMul(F64, DivScale1.getReg(0), Fma3, Flags);
+ auto Fma4 = B.buildFMA(F64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
Register Scale;
if (!ST.hasUsableDivScaleConditionOutput()) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
- LLT S32 = LLT::scalar(32);
-
- auto NumUnmerge = B.buildUnmerge(S32, LHS);
- auto DenUnmerge = B.buildUnmerge(S32, RHS);
- auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
- auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
+ auto NumUnmerge = B.buildUnmerge(I32, LHS);
+ auto DenUnmerge = B.buildUnmerge(I32, RHS);
+ auto Scale0Unmerge = B.buildUnmerge(I32, DivScale0);
+ auto Scale1Unmerge = B.buildUnmerge(I32, DivScale1);
- auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
+ auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, I1, NumUnmerge.getReg(1),
Scale1Unmerge.getReg(1));
- auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
+ auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, I1, DenUnmerge.getReg(1),
Scale0Unmerge.getReg(1));
- Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
+ Scale = B.buildXor(I1, CmpNum, CmpDen).getReg(0);
} else {
Scale = DivScale1.getReg(1);
}
- auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
+ auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {F64})
.addUse(Fma4.getReg(0))
.addUse(Fma3.getReg(0))
.addUse(Mul.getReg(0))
@@ -5146,7 +5176,7 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT Ty = MRI.getType(Res0);
- LLT InstrExpTy = Ty.isScalar(16) ? LLT::scalar(16) : LLT::scalar(32);
+ LLT InstrExpTy = Ty.isScalar(16) ? I16 : I32;
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
.addUse(Val)
@@ -5159,7 +5189,7 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
auto Fabs = B.buildFAbs(Ty, Val);
auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
auto IsFinite =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, Fabs, Inf, Flags);
auto Zero = B.buildConstant(InstrExpTy, 0);
Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
@@ -5180,26 +5210,23 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
Register RHS = MI.getOperand(3).getReg();
uint16_t Flags = MI.getFlags();
- LLT S32 = LLT::scalar(32);
- LLT S1 = LLT::scalar(1);
-
- auto Abs = B.buildFAbs(S32, RHS, Flags);
+ auto Abs = B.buildFAbs(F32, RHS, Flags);
const APFloat C0Val(1.0f);
- auto C0 = B.buildFConstant(S32, 0x1p+96f);
- auto C1 = B.buildFConstant(S32, 0x1p-32f);
- auto C2 = B.buildFConstant(S32, 1.0f);
+ auto C0 = B.buildFConstant(F32, 0x1p+96f);
+ auto C1 = B.buildFConstant(F32, 0x1p-32f);
+ auto C2 = B.buildFConstant(F32, 1.0f);
- auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
- auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
+ auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, I1, Abs, C0, Flags);
+ auto Sel = B.buildSelect(F32, CmpRes, C1, C2, Flags);
- auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
+ auto Mul0 = B.buildFMul(F32, RHS, Sel, Flags);
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F32})
.addUse(Mul0.getReg(0))
.setMIFlags(Flags);
- auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
+ auto Mul1 = B.buildFMul(F32, LHS, RCP, Flags);
B.buildFMul(Res, Sel, Mul1, Flags);
@@ -5214,7 +5241,6 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
// get. The f32 op is accurate enough for the f16 cas.
unsigned Flags = MI.getFlags();
assert(!ST.has16BitInsts());
- const LLT F32 = LLT::scalar(32);
auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
.addUse(Ext.getReg(0))
@@ -5231,9 +5257,6 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
Register Dst = MI.getOperand(0).getReg();
Register X = MI.getOperand(1).getReg();
const unsigned Flags = MI.getFlags();
- const LLT S1 = LLT::scalar(1);
- const LLT F32 = LLT::scalar(32);
- const LLT I32 = LLT::scalar(32);
if (allowApproxFunc(MF, Flags)) {
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
@@ -5244,7 +5267,7 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
}
auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
- auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
+ auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, I1, ScaleThreshold, X, Flags);
auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
@@ -5254,26 +5277,27 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
.addUse(SqrtX.getReg(0))
.setMIFlags(Flags);
+ auto SqrtSInt = B.buildBitcast(I32, SqrtS);
auto NegOne = B.buildConstant(I32, -1);
- auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
+ auto SqrtSNextDown = B.buildBitcast(F32, B.buildAdd(I32, SqrtSInt, NegOne));
auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
auto PosOne = B.buildConstant(I32, 1);
- auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
+ auto SqrtSNextUp = B.buildBitcast(F32, B.buildAdd(I32, SqrtSInt, PosOne));
auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
auto Zero = B.buildFConstant(F32, 0.0f);
- auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
+ auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, I1, SqrtVP, Zero, Flags);
SqrtS =
B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
- auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
+ auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, I1, SqrtVS, Zero, Flags);
SqrtS =
B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
} else {
@@ -5298,7 +5322,7 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
- auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+ auto IsZeroOrInf = B.buildIsFPClass(I1, SqrtX, fcZero | fcPosInf);
B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
MI.eraseFromParent();
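
With integer and float now distinct LLT kinds, the +/-1 ulp probes around the
f32 sqrt estimate go through explicit bitcasts: the estimate is bitcast to I32,
offset by -1 or +1, and bitcast back to F32. An equivalent host-side helper,
valid for positive finite inputs (illustrative name):

    #include <cstdint>
    #include <cstring>

    // Next f32 value one ulp below (N = -1) or above (N = +1) the estimate.
    float offsetUlp(float V, int32_t N) {
      uint32_t Bits;
      std::memcpy(&Bits, &V, sizeof(Bits));           // G_BITCAST F32 -> I32
      Bits += (uint32_t)N;                            // G_ADD with -1 or +1
      std::memcpy(&V, &Bits, sizeof(V));              // G_BITCAST I32 -> F32
      return V;
    }
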
@@ -5328,10 +5352,6 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
//
// sqrt(x) = g3
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
- const LLT F64 = LLT::scalar(64);
-
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
@@ -5340,12 +5360,12 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
- auto ZeroInt = B.buildConstant(S32, 0);
- auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
+ auto ZeroInt = B.buildConstant(I32, 0);
+ auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, I1, X, ScaleConstant);
// Scale up input if it is too small.
- auto ScaleUpFactor = B.buildConstant(S32, 256);
- auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
+ auto ScaleUpFactor = B.buildConstant(I32, 256);
+ auto ScaleUp = B.buildSelect(I32, Scaling, ScaleUpFactor, ZeroInt);
auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
auto SqrtY =
@@ -5372,15 +5392,15 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
// Scale down the result.
- auto ScaleDownFactor = B.buildConstant(S32, -128);
- auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
+ auto ScaleDownFactor = B.buildConstant(I32, -128);
+ auto ScaleDown = B.buildSelect(I32, Scaling, ScaleDownFactor, ZeroInt);
SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
// TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
// with finite only or nsz because rsq(+/-0) = +/-inf
// TODO: Check for DAZ and expand to subnormals
- auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+ auto IsZeroOrInf = B.buildIsFPClass(I1, SqrtX, fcZero | fcPosInf);
// If x is +INF, +0, or -0, use its original value
B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
@@ -5393,11 +5413,11 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty.isScalar(32))
+ if (Ty.isFloat(32))
return legalizeFSQRTF32(MI, MRI, B);
- if (Ty.isScalar(64))
+ if (Ty.isFloat(64))
return legalizeFSQRTF64(MI, MRI, B);
- if (Ty.isScalar(16))
+ if (Ty.isFloat(16))
return legalizeFSQRTF16(MI, MRI, B);
return false;
}
@@ -5420,13 +5440,10 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
LLT Ty = MRI.getType(Dst);
- const fltSemantics *FltSemantics;
- if (Ty.isScalar(32))
- FltSemantics = &APFloat::IEEEsingle();
- else if (Ty.isScalar(64))
- FltSemantics = &APFloat::IEEEdouble();
- else
- return false;
+ if (!Ty.isFloat())
+ return false;
+
+ const llvm::fltSemantics &FltSemantics = getFltSemanticForLLT(Ty);
auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
.addUse(Src)
@@ -5437,11 +5454,11 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
const bool UseIEEE = MFI->getMode().IEEE;
- auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
+ auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(FltSemantics));
auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
- auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
+ auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(FltSemantics, true));
if (UseIEEE)
B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
@@ -5529,15 +5546,15 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
}
if (Size < 32) {
- Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+ Src0 = B.buildAnyExt(I32, Src0).getReg(0);
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
- Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
+ Src1 = B.buildAnyExt(I32, Src1).getReg(0);
if (IID == Intrinsic::amdgcn_writelane)
- Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+ Src2 = B.buildAnyExt(I32, Src2).getReg(0);
- Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+ Register LaneOpDst = createLaneOp(Src0, Src1, Src2, I32);
B.buildTrunc(DstReg, LaneOpDst);
MI.eraseFromParent();
return true;
@@ -5546,7 +5563,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size % SplitSize != 0)
return false;
- LLT PartialResTy = LLT::scalar(SplitSize);
+ LLT PartialResTy = LLT::integer(SplitSize);
if (Ty.isVector()) {
LLT EltTy = Ty.getElementType();
unsigned EltSize = EltTy.getSizeInBits();
@@ -5594,7 +5611,7 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
LLT DstTy = MRI.getType(DstReg);
- LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
+ LLT IdxTy = LLT::integer(DstTy.getSizeInBits());
Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
if (!loadInputValue(KernargPtrReg, B,
@@ -5618,15 +5635,13 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
Register NumRecords = MI.getOperand(4).getReg();
Register Flags = MI.getOperand(5).getReg();
- LLT S32 = LLT::scalar(32);
-
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- auto Unmerge = B.buildUnmerge(S32, Pointer);
+ auto Unmerge = B.buildUnmerge(I32, Pointer);
Register LowHalf = Unmerge.getReg(0);
Register HighHalf = Unmerge.getReg(1);
- auto AndMask = B.buildConstant(S32, 0x0000ffff);
- auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+ auto AndMask = B.buildConstant(I32, 0x0000ffff);
+ auto Masked = B.buildAnd(I32, HighHalf, AndMask);
MachineInstrBuilder NewHighHalf = Masked;
std::optional<ValueAndVReg> StrideConst =
@@ -5636,13 +5651,13 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
if (StrideConst) {
uint32_t StrideVal = StrideConst->Value.getZExtValue();
uint32_t ShiftedStrideVal = StrideVal << 16;
- ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ ShiftedStride = B.buildConstant(I32, ShiftedStrideVal);
} else {
- auto ExtStride = B.buildAnyExt(S32, Stride);
- auto ShiftConst = B.buildConstant(S32, 16);
- ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ auto ExtStride = B.buildAnyExt(I32, Stride);
+ auto ShiftConst = B.buildConstant(I32, 16);
+ ShiftedStride = B.buildShl(I32, ExtStride, ShiftConst);
}
- NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ NewHighHalf = B.buildOr(I32, Masked, ShiftedStride);
}
Register NewHighHalfReg = NewHighHalf.getReg(0);
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
@@ -5701,7 +5716,7 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ auto Unmerge = B.buildUnmerge(I32, MI.getOperand(2).getReg());
Register Hi32 = Unmerge.getReg(1);
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
@@ -5721,7 +5736,6 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
Register BaseReg;
unsigned ImmOffset;
- const LLT S32 = LLT::scalar(32);
MachineRegisterInfo &MRI = *B.getMRI();
std::tie(BaseReg, ImmOffset) =
@@ -5748,15 +5762,15 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
if (Overflow != 0) {
if (!BaseReg) {
- BaseReg = B.buildConstant(S32, Overflow).getReg(0);
+ BaseReg = B.buildConstant(I32, Overflow).getReg(0);
} else {
- auto OverflowVal = B.buildConstant(S32, Overflow);
- BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
+ auto OverflowVal = B.buildConstant(I32, Overflow);
+ BaseReg = B.buildAdd(I32, BaseReg, OverflowVal).getReg(0);
}
}
if (!BaseReg)
- BaseReg = B.buildConstant(S32, 0).getReg(0);
+ BaseReg = B.buildConstant(I32, 0).getReg(0);
return std::pair(BaseReg, ImmOffset);
}
@@ -5766,52 +5780,50 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
Register Reg,
bool ImageStore) const {
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
LLT StoreVT = MRI.getType(Reg);
- assert(StoreVT.isVector() && StoreVT.getElementType().isScalar(16));
+ assert(StoreVT.isVector() && StoreVT.getElementType().isFloat(16));
if (ST.hasUnpackedD16VMem()) {
- auto Unmerge = B.buildUnmerge(S16, Reg);
+ auto Unmerge = B.buildUnmerge(I16, Reg);
SmallVector<Register, 4> WideRegs;
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
- WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+ WideRegs.push_back(B.buildAnyExt(I32, Unmerge.getReg(I)).getReg(0));
int NumElts = StoreVT.getNumElements();
- return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
+ return B.buildBuildVector(LLT::fixed_vector(NumElts, I32), WideRegs)
.getReg(0);
}
if (ImageStore && ST.hasImageStoreD16Bug()) {
if (StoreVT.getNumElements() == 2) {
SmallVector<Register, 4> PackedRegs;
- Reg = B.buildBitcast(S32, Reg).getReg(0);
+ Reg = B.buildBitcast(I32, Reg).getReg(0);
PackedRegs.push_back(Reg);
- PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
+ PackedRegs.resize(2, B.buildUndef(I32).getReg(0));
+ return B.buildBuildVector(V2I32, PackedRegs)
.getReg(0);
}
if (StoreVT.getNumElements() == 3) {
SmallVector<Register, 4> PackedRegs;
- auto Unmerge = B.buildUnmerge(S16, Reg);
+ auto Unmerge = B.buildUnmerge(I16, Reg);
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
- PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
- Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
- return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
+ PackedRegs.resize(6, B.buildUndef(I16).getReg(0));
+ Reg = B.buildBuildVector(V6I16, PackedRegs).getReg(0);
+ return B.buildBitcast(V3I32, Reg).getReg(0);
}
if (StoreVT.getNumElements() == 4) {
SmallVector<Register, 4> PackedRegs;
- Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
- auto Unmerge = B.buildUnmerge(S32, Reg);
+ Reg = B.buildBitcast(V2I32, Reg).getReg(0);
+ auto Unmerge = B.buildUnmerge(I32, Reg);
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
- PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
+ PackedRegs.resize(4, B.buildUndef(I32).getReg(0));
+ return B.buildBuildVector(V4I32, PackedRegs)
.getReg(0);
}
@@ -5819,8 +5831,7 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
}
if (StoreVT.isFixedVector(3, 16)) {
- Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
- .getReg(0);
+ Reg = B.buildPadVectorWithUndefElements(V4I16, Reg).getReg(0);
}
return Reg;
}
@@ -5841,7 +5852,9 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
}
// Fixup illegal register types for i8 stores.
if (Ty.isScalar(8) || Ty.isScalar(16)) {
- Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
+ if (Ty.isFloat())
+ VData = B.buildBitcast(I16, VData).getReg(0);
+ Register AnyExt = B.buildAnyExt(I32, VData).getReg(0);
return AnyExt;
}
@@ -5866,7 +5879,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
LLT Ty = MRI.getType(VData);
LLT EltTy = Ty.getScalarType();
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
- const LLT S32 = LLT::scalar(32);
MachineMemOperand *MMO = *MI.memoperands_begin();
const int MemSize = MMO->getSize().getValue();
@@ -5890,7 +5902,7 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
VIndex = MI.getOperand(3).getReg();
OpOffset = 1;
} else {
- VIndex = B.buildConstant(S32, 0).getReg(0);
+ VIndex = B.buildConstant(I32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -5978,7 +5990,6 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
MachineMemOperand *MMO = *MI.memoperands_begin();
const LLT MemTy = MMO->getMemoryType();
- const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
@@ -6004,7 +6015,7 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
VIndex = MI.getOperand(3 + OpOffset).getReg();
++OpOffset;
} else {
- VIndex = B.buildConstant(S32, 0).getReg(0);
+ VIndex = B.buildConstant(I32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -6081,20 +6092,24 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
if (IsTFE) {
unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
unsigned NumLoadDWords = NumValueDWords + 1;
- LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
+ LLT LoadTy = LLT::fixed_vector(NumLoadDWords, I32);
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
if (MemTy.getSizeInBits() < 32) {
- Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
+ Register ExtDst = B.getMRI()->createGenericVirtualRegister(I32);
B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
- B.buildTrunc(Dst, ExtDst);
+ if (Ty.isFloat()) {
+ B.buildBitcast(Dst, B.buildTrunc(I16, ExtDst));
+ } else {
+ B.buildTrunc(Dst, ExtDst);
+ }
} else if (NumValueDWords == 1) {
B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
} else {
SmallVector<Register, 5> LoadElts;
for (unsigned I = 0; I != NumValueDWords; ++I)
- LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
+ LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(I32));
LoadElts.push_back(StatusDst);
B.buildUnmerge(LoadElts, LoadDstReg);
LoadElts.truncate(NumValueDWords);
@@ -6102,23 +6117,33 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
}
} else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
(IsD16 && !Ty.isVector())) {
- Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
+ Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(I32);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- B.buildTrunc(Dst, LoadDstReg);
+ if (Ty.isFloat()) {
+ B.buildBitcast(Dst, B.buildTrunc(I16, LoadDstReg));
+ } else {
+ B.buildTrunc(Dst, LoadDstReg);
+ }
} else if (Unpacked && IsD16 && Ty.isVector()) {
- LLT UnpackedTy = Ty.changeElementSize(32);
+ LLT UnpackedTy = Ty.changeElementType(I32);
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
// FIXME: G_TRUNC should work, but legalization currently fails
- auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
+ auto Unmerge = B.buildUnmerge(I32, LoadDstReg);
SmallVector<Register, 4> Repack;
for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
- B.buildMergeLikeInstr(Dst, Repack);
+
+ if (Ty.isFloatVector()) {
+ B.buildBitcast(Dst, B.buildMergeLikeInstr(Ty.changeElementType(I16), Repack));
+ } else {
+ B.buildMergeLikeInstr(Dst, Repack);
+ }
+
} else {
buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
AuxiliaryData, MMO, IsTyped, HasVIndex, B);
@@ -6251,7 +6276,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
VIndex = MI.getOperand(4 + OpOffset).getReg();
++OpOffset;
} else {
- VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+ VIndex = B.buildConstant(I32, 0).getReg(0);
}
Register VOffset = MI.getOperand(4 + OpOffset).getReg();
@@ -6290,8 +6315,6 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
unsigned ArgOffset,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
bool IsA16, bool IsG16) {
- const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
auto EndIdx = Intr->VAddrEnd;
for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
@@ -6310,13 +6333,13 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
PackedAddrs.push_back(
- B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+ B.buildBuildVector(V2I16, {AddrReg, B.buildUndef(I16).getReg(0)})
.getReg(0));
} else {
assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
"Bias needs to be converted to 16 bit in A16 mode");
// Handle any gradient or coordinate operands that should not be packed
- AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
+ AddrReg = B.buildBitcast(V2I16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
}
} else {
@@ -6331,12 +6354,12 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
// Check for _L to _LZ optimization
!MI.getOperand(ArgOffset + I + 1).isReg()) {
PackedAddrs.push_back(
- B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+ B.buildBuildVector(V2I16, {AddrReg, B.buildUndef(I16).getReg(0)})
.getReg(0));
} else {
PackedAddrs.push_back(
B.buildBuildVector(
- V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
+ V2I16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
.getReg(0));
++I;
}
@@ -6348,8 +6371,6 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
int DimIdx, int NumVAddrs) {
- const LLT S32 = LLT::scalar(32);
- (void)S32;
SmallVector<Register, 8> AddrRegs;
for (int I = 0; I != NumVAddrs; ++I) {
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
@@ -6362,7 +6383,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
int NumAddrRegs = AddrRegs.size();
if (NumAddrRegs != 1) {
auto VAddr =
- B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
+ B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, I32), AddrRegs);
MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
}
@@ -6402,10 +6423,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
MachineRegisterInfo *MRI = B.getMRI();
- const LLT S32 = LLT::scalar(32);
- const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
-
unsigned DMask = 0;
Register VData;
LLT Ty;
@@ -6520,13 +6537,13 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (UsePartialNSA) {
// Pack registers that would go over NSAMaxSize into last VAddr register
LLT PackedAddrTy =
- LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
+ LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), I16);
auto Concat = B.buildConcatVectors(
PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
PackedRegs.resize(NSAMaxSize);
} else if (!UseNSA && PackedRegs.size() > 1) {
- LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
+ LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), I16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
PackedRegs.resize(1);
@@ -6635,17 +6652,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (IsD16 && ST.hasUnpackedD16VMem()) {
RoundedTy =
- LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
- TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
- RegTy = S32;
+ LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), I32);
+ TFETy = LLT::fixed_vector(AdjustedNumElts + 1, I32);
+ RegTy = I32;
} else {
unsigned EltSize = EltTy.getSizeInBits();
unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
RoundedTy = LLT::scalarOrVector(
- ElementCount::getFixed(RoundedSize / EltSize), EltSize);
- TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
- RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
+ ElementCount::getFixed(RoundedSize / EltSize), EltTy);
+ TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, I32);
+ RegTy = !IsTFE && EltSize == 16 ? V2I16 : I32;
}
// The return type does not need adjustment.
@@ -6681,10 +6698,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MI.removeOperand(1);
// Handle the easy case that requires no repack instructions.
- if (Ty.isScalar(32)) {
+ if (Ty.isInteger(32)) {
B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
return true;
}
+
+ if (Ty.isFloat(32)) {
+ auto DstI32 = MRI->createGenericVirtualRegister(I32);
+ B.buildUnmerge({DstI32, Dst1Reg}, NewResultReg);
+ B.buildBitcast(DstReg, DstI32);
+ return true;
+ }
}
// Now figure out how to copy the new result register back into the old
@@ -6731,10 +6755,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// cast for the TFE result if a multiple of v2s16 was used.
if (!RegTy.isFixedVector(2, 16) && !ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
- Reg = B.buildBitcast(V2S16, Reg).getReg(0);
+ Reg = B.buildBitcast(V2I16, Reg).getReg(0);
} else if (ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
- Reg = B.buildTrunc(S16, Reg).getReg(0);
+ Reg = B.buildTrunc(I16, Reg).getReg(0);
}
}
@@ -6763,8 +6787,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (ResultRegs.size() == 1) {
NewResultReg = ResultRegs[0];
} else if (ResultRegs.size() == 2) {
- LLT V4S16 = LLT::fixed_vector(4, 16);
- NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
+ NewResultReg = B.buildConcatVectors(V4I16, ResultRegs).getReg(0);
} else {
return false;
}
@@ -6801,7 +6824,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
: AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
// The 8-bit and 16-bit scalar buffer load instructions have 32-bit
// destination register.
- Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
+ Dst = B.getMRI()->createGenericVirtualRegister(I32);
} else {
Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
Dst = OrigDst;
@@ -6814,7 +6837,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
B.setInsertPt(B.getMBB(), MI);
}
- if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
+ if (shouldBitcastLoadStoreType(ST, Ty, LLT::integer(Size))) {
Ty = getBitcastRegisterType(Ty);
Helper.bitcastDst(MI, Ty, 0);
B.setInsertPt(B.getMBB(), MI);
@@ -6849,7 +6872,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
- Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
+ Helper.widenScalarDst(MI, getPow2IntegerType(Ty), 0);
}
Observer.changedInstr(MI);
@@ -6912,8 +6935,6 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
- const LLT S64 = LLT::scalar(64);
-
Register SGPR01(AMDGPU::SGPR0_SGPR1);
// For code object version 5, queue_ptr is passed through implicit kernarg.
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
@@ -6936,15 +6957,15 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- LLT::scalar(64), commonAlignment(Align(64), Offset));
+ I64, commonAlignment(Align(64), Offset));
// Pointer address
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildConstant(I64, Offset).getReg(0));
// Load address
- Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
+ Register Temp = B.buildLoad(I64, LoadAddr, *MMO).getReg(0);
B.buildCopy(SGPR01, Temp);
B.buildInstr(AMDGPU::S_TRAP)
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
@@ -7012,11 +7033,6 @@ bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
MachineIRBuilder &B) const {
MachineRegisterInfo &MRI = *B.getMRI();
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
- const LLT V3S32 = LLT::fixed_vector(3, 32);
-
Register DstReg = MI.getOperand(0).getReg();
Register NodePtr = MI.getOperand(2).getReg();
Register RayExtent = MI.getOperand(3).getReg();
@@ -7066,10 +7082,10 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
SmallVector<Register, 12> Ops;
if (UseNSA && IsGFX11Plus) {
- auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
- auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+ auto packLanes = [&Ops, &B](Register Src) {
+ auto Unmerge = B.buildUnmerge({I32, I32, I32}, Src);
auto Merged = B.buildMergeLikeInstr(
- V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
+ V3I32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
Ops.push_back(Merged.getReg(0));
};
@@ -7078,20 +7094,20 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
+ auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, RayDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, RayInvDir);
auto MergedDir = B.buildMergeLikeInstr(
- V3S32,
+ V3I32,
{B.buildBitcast(
- S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
+ I32, B.buildMergeLikeInstr(V2I16, {UnmergeRayInvDir.getReg(0),
UnmergeRayDir.getReg(0)}))
.getReg(0),
B.buildBitcast(
- S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
+ I32, B.buildMergeLikeInstr(V2I16, {UnmergeRayInvDir.getReg(1),
UnmergeRayDir.getReg(1)}))
.getReg(0),
B.buildBitcast(
- S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
+ I32, B.buildMergeLikeInstr(V2I16, {UnmergeRayInvDir.getReg(2),
UnmergeRayDir.getReg(2)}))
.getReg(0)});
Ops.push_back(MergedDir.getReg(0));
@@ -7101,7 +7117,7 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
}
} else {
if (Is64) {
- auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
+ auto Unmerge = B.buildUnmerge({I32, I32}, NodePtr);
Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));
} else {
@@ -7109,8 +7125,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
}
Ops.push_back(RayExtent);
- auto packLanes = [&Ops, &S32, &B](Register Src) {
- auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+ auto packLanes = [&Ops, &B](Register Src) {
+ auto Unmerge = B.buildUnmerge({I32, I32, I32}, Src);
Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));
Ops.push_back(Unmerge.getReg(2));
@@ -7118,11 +7134,11 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
- Register R1 = MRI.createGenericVirtualRegister(S32);
- Register R2 = MRI.createGenericVirtualRegister(S32);
- Register R3 = MRI.createGenericVirtualRegister(S32);
+ auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, RayDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, RayInvDir);
+ Register R1 = MRI.createGenericVirtualRegister(I32);
+ Register R2 = MRI.createGenericVirtualRegister(I32);
+ Register R3 = MRI.createGenericVirtualRegister(I32);
B.buildMergeLikeInstr(R1,
{UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
B.buildMergeLikeInstr(
@@ -7140,7 +7156,7 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
if (!UseNSA) {
// Build a single vector containing all the operands so far prepared.
- LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
+ LLT OpTy = LLT::fixed_vector(Ops.size(), I32);
Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
Ops.clear();
Ops.push_back(MergedOps);
@@ -7177,11 +7193,10 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
// With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
if (!ST.hasArchitectedSGPRs())
return false;
- LLT S32 = LLT::scalar(32);
Register DstReg = MI.getOperand(0).getReg();
- auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
- auto LSB = B.buildConstant(S32, 25);
- auto Width = B.buildConstant(S32, 5);
+ auto TTMP8 = B.buildCopy(I32, Register(AMDGPU::TTMP8));
+ auto LSB = B.buildConstant(I32, 25);
+ auto Width = B.buildConstant(I32, 5);
B.buildUbfx(DstReg, TTMP8, LSB, Width);
MI.eraseFromParent();
return true;
@@ -7197,15 +7212,15 @@ bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Src = MI.getOperand(0).getReg();
- if (MRI.getType(Src) != S64)
+ if (MRI.getType(Src) != I64)
return false;
auto ModeReg =
- B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
+ B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {I32},
/*HasSideEffects=*/true, /*isConvergent=*/false)
.addImm(FPEnvModeBitField);
auto TrapReg =
- B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
+ B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {I32},
/*HasSideEffects=*/true, /*isConvergent=*/false)
.addImm(FPEnvTrapBitField);
B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
@@ -7220,7 +7235,7 @@ bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
if (!MRI.getType(Src).isScalar(64))
return false;
- auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
+ auto Unmerge = B.buildUnmerge({I32, I32}, MI.getOperand(0));
B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
/*HasSideEffects=*/true, /*isConvergent=*/false)
.addImm(static_cast<int16_t>(FPEnvModeBitField))
@@ -7524,18 +7539,16 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
Register Index = MI.getOperand(5).getReg();
- LLT S32 = LLT::scalar(32);
if (!MRI.getType(Index).isScalar(32))
- MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ MI.getOperand(5).setReg(B.buildAnyExt(I32, Index).getReg(0));
return true;
}
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
- LLT S32 = LLT::scalar(32);
if (!MRI.getType(Index).isScalar(32))
- MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ MI.getOperand(7).setReg(B.buildAnyExt(I32, Index).getReg(0));
return true;
}
case Intrinsic::amdgcn_fmed3: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 107d0f8c495032..bbff60b5a31870 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -226,20 +226,21 @@ bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
MachineInstr &MI) const {
- const LLT S32 = LLT::scalar(32);
+ const LLT I32 = LLT::integer(32);
+ const LLT F32 = LLT::float32();
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
- if (!SrcTy.isScalar(32))
- SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
+ if (!SrcTy.isInteger(32))
+ SrcReg = B.buildAnyExtOrTrunc(I32, SrcReg).getReg(0);
- if (Ty.isScalar(32)) {
+ if (Ty.isFloat(32)) {
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
MI.getFlags());
} else {
- auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
+ auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {F32}, {SrcReg},
MI.getFlags());
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
}
>From e205274f150802132787ee438fc681708209d0d7 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Tue, 17 Dec 2024 10:35:26 +0000
Subject: [PATCH 09/12] add custom bf16 lowering for fpext and fptrunc
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 50 +++++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 ++
2 files changed, 53 insertions(+)
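Not part of the patch: a standalone sketch of the bit manipulation the new legalizeFPExt/legalizeFPTrunc helpers below emit for bf16, written against plain host types so it can be checked in isolation. bf16 occupies the upper 16 bits of an IEEE f32; the truncating direction does no rounding, matching the FIXME about canonicalization left in legalizeFPTrunc. The function names here are illustrative only.

// Illustrative only, not part of the patch.
#include <cstdint>
#include <cstring>

static float bf16ToF32(uint16_t B) {
  uint32_t Bits = static_cast<uint32_t>(B) << 16; // anyext + shl 16, as in legalizeFPExt
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static uint16_t f32ToBF16Trunc(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return static_cast<uint16_t>(Bits >> 16); // lshr 16 + trunc, as in legalizeFPTrunc (no rounding)
}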
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 2075fd0d27dd2c..bea04ecf574745 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1115,10 +1115,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
{{F32, F64}, {F16, F32}, {V2F16, V2F32}, {V2F16, V2F64}});
else
FPTruncActions.legalFor({{F32, F64}, {F16, F32}});
+ FPTruncActions.customFor({{BF16, F32}});
FPTruncActions.scalarize(0).lower();
getActionDefinitionsBuilder(G_FPEXT)
.legalFor({{F64, F32}, {F32, F16}})
+ .customFor({{F32, BF16}})
.narrowScalarFor({{I64, I16}}, changeTo(0, I32))
.scalarize(0);
@@ -2230,6 +2232,10 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
return legalizeLoad(Helper, MI);
+ case TargetOpcode::G_FPEXT:
+ return legalizeFPExt(MI, MRI, B);
+ case TargetOpcode::G_FPTRUNC:
+ return legalizeFPTrunc(MI, MRI, B);
case TargetOpcode::G_STORE:
return legalizeStore(Helper, MI);
case TargetOpcode::G_FMAD:
@@ -3256,6 +3262,50 @@ bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
return false;
}
+bool AMDGPULegalizerInfo::legalizeFPExt(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: move to LegalizerHelper
+ const SITargetLowering *TLI = ST.getTargetLowering();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ auto ShiftTy = TLI->getPreferredShiftAmountTy(I32);
+
+ B.buildBitcast(
+ DstReg, B.buildShl(I32, B.buildAnyExt(I32, B.buildBitcast(I16, SrcReg)),
+ B.buildConstant(ShiftTy, 16)));
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFPTrunc(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: move to LegalizerHelper
+ const SITargetLowering *TLI = ST.getTargetLowering();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ auto ShiftTy = TLI->getPreferredShiftAmountTy(I32);
+
+ // FIXME:
+ // if (!DAG.isKnownNeverSNaN(Op)) {
+ // Op = DAG.getNode(ISD::FCANONICALIZE, dl, MVT::f32, Op,
+ // Node->getFlags());
+ // }
+
+ B.buildBitcast(
+ DstReg, B.buildTrunc(I16, B.buildLShr(I32, B.buildBitcast(I32, SrcReg),
+ B.buildConstant(ShiftTy, 16))));
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeFMad(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 86c15197805d23..2deda39224abea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -77,6 +77,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const;
+ bool legalizeFPExt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const;
+ bool legalizeFPTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const;
+
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
>From 875848b8f01cdb068169ffa34bcbcb7d3f2e7e69 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Mon, 23 Dec 2024 13:15:57 +0000
Subject: [PATCH 10/12] fix some intrinsics
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 55 ++++++++++++-------
1 file changed, 35 insertions(+), 20 deletions(-)
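Not part of the patch: the recurring shape of the fixes in this commit, shown on plain host types. Once a value carries an FP LLT, integer-only operations (the sign extraction in legalizeFPTOI, the unmerges in the BVH lowering) need an explicit bitcast to the same-sized integer type first. The sketch below mirrors the legalizeFPTOI case and is illustrative only.

// Illustrative only, not part of the patch.
#include <cstdint>
#include <cstring>

static int32_t signMaskOfF32(float F) {
  int32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // the G_BITCAST this commit inserts before the shift
  return Bits >> 31;                    // the existing arithmetic shift by 31 (G_ASHR)
}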
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bea04ecf574745..119b7e6e59f8b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,9 +26,11 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -73,7 +75,9 @@ static const LLT I1024 = LLT::integer(1024);
static const LLT MaxScalar = LLT::integer(MaxRegisterSize);
static const LLT V2I8 = LLT::fixed_vector(2, I8);
+
static const LLT V2I16 = LLT::fixed_vector(2, I16);
+static const LLT V3I16 = LLT::fixed_vector(3, I16);
static const LLT V4I16 = LLT::fixed_vector(4, I16);
static const LLT V6I16 = LLT::fixed_vector(6, I16);
static const LLT V8I16 = LLT::fixed_vector(8, I16);
@@ -1973,7 +1977,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalForCartesianProduct(AllS64Vectors, {F64})
.clampNumElements(0, V16I32, V32I32)
.clampNumElements(0, V2I64, V16I64)
- .fewerElementsIf(isWideVec16(0), changeTo(0, V2I16))
+ .fewerElementsIf(isWideVec16(0), changeElementCountTo(0, V2I16))
.moreElementsIf(
isIllegalRegisterType(0),
moreElementsToNextExistingRegClass(0));
@@ -2533,7 +2537,7 @@ bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
MachineIRBuilder &B) const {
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
- assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
+ assert(Ty.isFloat() && Ty.getSizeInBits() == 64);
APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
@@ -2732,13 +2736,13 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
//
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
MachineInstrBuilder Sign;
- if (Signed && SrcLT.isScalar(32)) {
+ if (Signed && SrcLT.isFloat(32)) {
// However, a 32-bit floating point number has only 23 bits mantissa and
// it's not enough to hold all the significant bits of `lof` if val is
// negative. To avoid the loss of precision, We need to take the absolute
// value after truncating and flip the result back based on the original
// signedness.
- Sign = B.buildAShr(I32, Src, B.buildConstant(I32, 31));
+ Sign = B.buildAShr(I32, B.buildBitcast(I32, Src), B.buildConstant(I32, 31));
Trunc = B.buildFAbs(F32, Trunc, Flags);
}
MachineInstrBuilder K0, K1;
@@ -3697,7 +3701,7 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
auto NeedsScaling =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::integer(1), X, Threshold, Flags);
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
@@ -4537,7 +4541,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
llvm_unreachable("failed to find kernarg segment ptr");
- auto COffset = B.buildConstant(LLT::scalar(64), Offset);
+ auto COffset = B.buildConstant(LLT::integer(64), Offset);
// TODO: Should get nuw
return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
}
@@ -5226,7 +5230,7 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT Ty = MRI.getType(Res0);
- LLT InstrExpTy = Ty.isScalar(16) ? I16 : I32;
+ LLT InstrExpTy = Ty.isFloat(16) ? I16 : I32;
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
.addUse(Val)
@@ -6492,9 +6496,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT AddrTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 =
- ST.hasG16() ? (BaseOpcode->Gradients && GradTy.isScalar(16)) : GradTy.isScalar(16);
- const bool IsA16 = AddrTy.isScalar(16);
- const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType().isScalar(16);
+ ST.hasG16() ? (BaseOpcode->Gradients && GradTy.isInteger(16)) : GradTy.isInteger(16);
+ const bool IsA16 = AddrTy.isInteger(16);
+ const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType().isFloat(16);
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -6912,7 +6916,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
if (Dst != OrigDst) {
MI.getOperand(0).setReg(Dst);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- B.buildTrunc(OrigDst, Dst);
+
+ if (Ty.isFloat()) {
+ auto Trunc = B.buildTrunc(Ty.dropType(), Dst);
+ B.buildBitcast(OrigDst, Trunc);
+ } else {
+ B.buildTrunc(OrigDst, Dst);
+ }
}
// If we don't have 96-bit result scalar loads, widening to 128-bit should
@@ -7102,8 +7112,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
const bool IsGFX11 = AMDGPU::isGFX11(ST);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
- const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
- const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
+ const bool IsA16 = MRI.getType(RayDir).getElementType().isFloat(16);
+ const bool Is64 = MRI.getType(NodePtr).isInteger(64);
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
@@ -7144,8 +7154,10 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, RayInvDir);
+ auto BitcastRayDir = B.buildBitcast(V3I16, RayDir);
+ auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, BitcastRayDir);
+ auto BitcastRayInvDir = B.buildBitcast(V3I16, RayInvDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, BitcastRayInvDir);
auto MergedDir = B.buildMergeLikeInstr(
V3I32,
{B.buildBitcast(
@@ -7176,7 +7188,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
Ops.push_back(RayExtent);
auto packLanes = [&Ops, &B](Register Src) {
- auto Unmerge = B.buildUnmerge({I32, I32, I32}, Src);
+ auto Bitcast = B.buildBitcast(V3I32, Src);
+ auto Unmerge = B.buildUnmerge({I32, I32, I32}, Bitcast);
Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));
Ops.push_back(Unmerge.getReg(2));
@@ -7184,8 +7197,10 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, RayInvDir);
+ auto BitcastRayDir = B.buildBitcast(V3I16, RayDir);
+ auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, BitcastRayDir);
+ auto BitcastRayInvDir = B.buildBitcast(V3I16, RayInvDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, BitcastRayInvDir);
Register R1 = MRI.createGenericVirtualRegister(I32);
Register R2 = MRI.createGenericVirtualRegister(I32);
Register R3 = MRI.createGenericVirtualRegister(I32);
@@ -7589,7 +7604,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
Register Index = MI.getOperand(5).getReg();
- if (!MRI.getType(Index).isScalar(32))
+ if (!MRI.getType(Index).isInteger(32))
MI.getOperand(5).setReg(B.buildAnyExt(I32, Index).getReg(0));
return true;
}
@@ -7597,7 +7612,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
- if (!MRI.getType(Index).isScalar(32))
+ if (!MRI.getType(Index).isInteger(32))
MI.getOperand(7).setReg(B.buildAnyExt(I32, Index).getReg(0));
return true;
}
>From a434d575c6674c9d3b86e6a7c04dbabf1626f6a7 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Wed, 8 Jan 2025 14:43:26 +0000
Subject: [PATCH 11/12] add draft for InferTypeInfoPass
---
.../CodeGen/GlobalISel/InferTypeInfoPass.h | 43 +++
llvm/include/llvm/InitializePasses.h | 1 +
llvm/lib/CodeGen/GlobalISel/CMakeLists.txt | 1 +
llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp | 1 +
.../CodeGen/GlobalISel/InferTypeInfoPass.cpp | 298 ++++++++++++++++++
.../llvm/lib/CodeGen/GlobalISel/BUILD.gn | 1 +
6 files changed, 345 insertions(+)
create mode 100644 llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h
create mode 100644 llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp
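Not part of the patch: a hypothetical sketch of wiring the new pass into a legacy pass manager pipeline. The series does not show where InferTypeInfo gets scheduled, so the placement and the helper name below are assumptions.

// Hypothetical usage sketch; pipeline placement is an assumption, not shown in this series.
#include "llvm/CodeGen/GlobalISel/InferTypeInfoPass.h"
#include "llvm/IR/LegacyPassManager.h"
using namespace llvm;

static void addInferTypeInfoSketch(legacy::PassManagerBase &PM) {
  // Assumed placement: after generic MIR exists and before passes that care
  // about the integer/float distinction consume it.
  PM.add(new InferTypeInfo());
}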
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h b/llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h
new file mode 100644
index 00000000000000..7fbbe76c8ee6a8
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h
@@ -0,0 +1,43 @@
+#ifndef LLVM_CODEGEN_GLOBALISEL_INFERTYPEINFOPASS_H
+#define LLVM_CODEGEN_GLOBALISEL_INFERTYPEINFOPASS_H
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class InferTypeInfo : public MachineFunctionPass {
+public:
+ static char ID;
+
+private:
+ MachineRegisterInfo *MRI = nullptr;
+ MachineFunction *MF = nullptr;
+
+ MachineIRBuilder Builder;
+
+ /// Initialize the field members using \p MF.
+ void init(MachineFunction &MF);
+
+public:
+ InferTypeInfo() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ bool inferTypeInfo(MachineFunction &MF);
+
+ bool shouldBeFP(MachineOperand &Op, unsigned Depth) const;
+
+ void updateDef(Register Reg);
+
+ void updateUse(MachineOperand &Op, bool FP);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_GLOBALISEL_INFERTYPEINFOPASS_H
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 1cb9013bc48cc5..c07735551be317 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -131,6 +131,7 @@ void initializeHardwareLoopsLegacyPass(PassRegistry &);
void initializeMIRProfileLoaderPassPass(PassRegistry &);
void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &);
void initializeIRTranslatorPass(PassRegistry &);
+void initializeInferTypeInfoPass(PassRegistry &);
void initializeIVUsersWrapperPassPass(PassRegistry &);
void initializeIfConverterPass(PassRegistry &);
void initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index a45024d120be68..627b629bb7846e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -13,6 +13,7 @@ add_llvm_component_library(LLVMGlobalISel
GIMatchTableExecutor.cpp
GISelChangeObserver.cpp
IRTranslator.cpp
+ InferTypeInfoPass.cpp
InlineAsmLowering.cpp
InstructionSelect.cpp
InstructionSelector.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
index efcc40641ea80c..b23b9499b4972d 100644
--- a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -16,6 +16,7 @@ using namespace llvm;
void llvm::initializeGlobalISel(PassRegistry &Registry) {
initializeIRTranslatorPass(Registry);
+ initializeInferTypeInfoPass(Registry);
initializeLegalizerPass(Registry);
initializeLoadStoreOptPass(Registry);
initializeLocalizerPass(Registry);
diff --git a/llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp b/llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp
new file mode 100644
index 00000000000000..471f26d0e02804
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp
@@ -0,0 +1,298 @@
+//===- llvm/CodeGen/GlobalISel/InferTypeInfoPass.cpp - InferTypeInfoPass -*-
+// C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the InferTypeInfo machine function pass.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/InferTypeInfoPass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "mir-infer-type-info"
+
+using namespace llvm;
+
+char InferTypeInfo::ID = 0;
+
+INITIALIZE_PASS_BEGIN(InferTypeInfo, DEBUG_TYPE, "TODO", false, false)
+INITIALIZE_PASS_END(InferTypeInfo, DEBUG_TYPE, "TODO", false, false)
+
+void InferTypeInfo::init(MachineFunction &MF) {
+ this->MF = &MF;
+ MRI = &MF.getRegInfo();
+ Builder.setMF(MF);
+}
+
+void InferTypeInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static LLT updateType(LLT Ty, bool FP) {
+ LLT InferredScalarTy =
+ FP ? LLT::floatingPoint(Ty.getScalarSizeInBits(), LLT::FPInfo::IEEE_FLOAT)
+ : LLT::integer(Ty.getScalarSizeInBits());
+ LLT InferredTy =
+ Ty.isVector() ? Ty.changeElementType(InferredScalarTy) : InferredScalarTy;
+
+ return InferredTy;
+}
+
+void InferTypeInfo::updateDef(Register Reg) {
+ LLT Ty = MRI->getType(Reg);
+ LLT InferredTy = updateType(Ty, false);
+
+ MRI->setType(Reg, InferredTy);
+}
+
+void InferTypeInfo::updateUse(MachineOperand &Op, bool FP) {
+ assert(Op.isReg());
+ LLT Ty = MRI->getType(Op.getReg());
+ LLT InferredTy = updateType(Ty, FP);
+
+ MachineOperand *Def = MRI->getOneDef(Op.getReg());
+ MachineInstr *MI = Op.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+
+ Builder.setInsertPt(*MBB, MI);
+ auto Bitcast = Builder.buildBitcast(InferredTy, Def->getReg());
+ Op.setReg(Bitcast.getReg(0));
+}
+
+constexpr unsigned MaxFPRSearchDepth = 5;
+
+bool InferTypeInfo::shouldBeFP(MachineOperand &Op, unsigned Depth = 0) const {
+ if (Depth > MaxFPRSearchDepth)
+ return false;
+
+ if (!Op.isReg())
+ return false;
+
+ MachineInstr &MI = *Op.getParent();
+
+ auto Pred = [&](MachineOperand &O) { return shouldBeFP(O, Depth + 1); };
+
+ // TODO: cache FP registers
+
+ switch (MI.getOpcode()) {
+ // def and use fp instructions
+ case TargetOpcode::G_FABS:
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FCANONICALIZE:
+ case TargetOpcode::G_FCEIL:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FCOPYSIGN:
+ case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FEXP2:
+ case TargetOpcode::G_FEXP:
+ case TargetOpcode::G_FFLOOR:
+ case TargetOpcode::G_FLOG10:
+ case TargetOpcode::G_FLOG2:
+ case TargetOpcode::G_FLOG:
+ case TargetOpcode::G_FMA:
+ case TargetOpcode::G_FMAD:
+ case TargetOpcode::G_FMAXIMUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ case TargetOpcode::G_FMINIMUM:
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FNEARBYINT:
+ case TargetOpcode::G_FNEG:
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPOW:
+ case TargetOpcode::G_FPTRUNC:
+ case TargetOpcode::G_FREM:
+ case TargetOpcode::G_FRINT:
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FTAN:
+ case TargetOpcode::G_FACOS:
+ case TargetOpcode::G_FASIN:
+ case TargetOpcode::G_FATAN:
+ case TargetOpcode::G_FATAN2:
+ case TargetOpcode::G_FCOSH:
+ case TargetOpcode::G_FSINH:
+ case TargetOpcode::G_FTANH:
+ case TargetOpcode::G_FSQRT:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_INTRINSIC_ROUND:
+ case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
+ case TargetOpcode::G_INTRINSIC_TRUNC:
+ case TargetOpcode::G_VECREDUCE_FADD:
+ case TargetOpcode::G_VECREDUCE_FMUL:
+ case TargetOpcode::G_VECREDUCE_FMAX:
+ case TargetOpcode::G_VECREDUCE_FMIN:
+ case TargetOpcode::G_VECREDUCE_FMAXIMUM:
+ case TargetOpcode::G_VECREDUCE_FMINIMUM:
+ case TargetOpcode::G_VECREDUCE_SEQ_FADD:
+ case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
+ return true;
+ // use only fp instructions
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP:
+ return Op.isDef();
+ // def only fp instructions
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI:
+ case TargetOpcode::G_FPTOSI_SAT:
+ case TargetOpcode::G_FPTOUI_SAT:
+ case TargetOpcode::G_FCMP:
+ case TargetOpcode::G_LROUND:
+ case TargetOpcode::G_LLROUND:
+ return Op.isUse();
+ case TargetOpcode::G_FREEZE:
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_PHI:
+ case TargetOpcode::G_SELECT:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ case TargetOpcode::G_INSERT_SUBVECTOR:
+ case TargetOpcode::G_EXTRACT_SUBVECTOR:
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ case TargetOpcode::G_SPLAT_VECTOR:
+ case TargetOpcode::G_STEP_VECTOR:
+ case TargetOpcode::G_VECTOR_COMPRESS: {
+ return all_of(MI.all_defs(),
+ [&](MachineOperand &O) {
+ return all_of(MRI->use_operands(O.getReg()), Pred);
+ }) &&
+ all_of(MI.all_uses(), [&](MachineOperand &O) {
+ return all_of(MRI->def_operands(O.getReg()), Pred);
+ });
+ }
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &LHS = MI.getOperand(1);
+ MachineOperand &RHS = MI.getOperand(2);
+
+ return all_of(MRI->use_operands(Dst.getReg()), Pred) &&
+ (!LHS.isReg() || all_of(MRI->def_operands(LHS.getReg()), Pred)) &&
+ (!RHS.isReg() || all_of(MRI->def_operands(RHS.getReg()), Pred));
+ }
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_INDEXED_STORE: {
+ MachineOperand &Val = MI.getOperand(0);
+ return Op.getReg() == Val.getReg() && all_of(MRI->def_operands(Op.getReg()), Pred);
+ }
+ case TargetOpcode::G_INDEXED_LOAD:
+ case TargetOpcode::G_LOAD: {
+ MachineOperand &Dst = MI.getOperand(0);
+ return Op.getReg() == Dst.getReg() && all_of(MRI->use_operands(Dst.getReg()), Pred);
+ }
+ case TargetOpcode::G_ATOMICRMW_FADD:
+ case TargetOpcode::G_ATOMICRMW_FSUB:
+ case TargetOpcode::G_ATOMICRMW_FMAX:
+ case TargetOpcode::G_ATOMICRMW_FMIN: {
+ MachineOperand &WriteBack = MI.getOperand(0);
+ MachineOperand &FPOp = MI.getOperand(2);
+ return Op.getReg() == WriteBack.getReg() || Op.getReg() == FPOp.getReg();
+ }
+ case TargetOpcode::G_INTRINSIC_CONVERGENT:
+ case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ case TargetOpcode::G_INTRINSIC: {
+ GIntrinsic *Intrinsic = dyn_cast<GIntrinsic>(&MI);
+ if (!Intrinsic)
+ return false;
+
+ switch (Intrinsic->getIntrinsicID()) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_sqrt:
+ case Intrinsic::amdgcn_fdot2_f16_f16:
+ case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
+ return true;
+ default:
+ return false;
+ }
+ return false;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
+
+bool InferTypeInfo::inferTypeInfo(MachineFunction &MF) {
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB.instrs()) {
+
+ for (auto &Def : MI.all_defs()) {
+ if (shouldBeFP(Def)) {
+ updateDef(Def.getReg());
+ Changed |= true;
+ }
+ }
+
+ for (auto &Use : MI.all_uses()) {
+ bool IsFPDef =
+ MRI->getVRegDef(Use.getReg()) &&
+ all_of(MRI->def_operands(Use.getReg()),
+ [&](MachineOperand &Op) { return shouldBeFP(Op); });
+ bool IsFPUse = shouldBeFP(Use);
+
+ if (IsFPUse && !IsFPDef) {
+ updateUse(Use, true);
+ Changed |= true;
+ } else if (!IsFPUse && IsFPDef) {
+ updateUse(Use, false);
+ Changed |= true;
+ }
+ }
+
+ for (auto &MemOp : MI.memoperands()) {
+ bool IsFP = any_of(MI.all_defs(), [&](MachineOperand &O){ return shouldBeFP(O); }) ||
+ any_of(MI.all_uses(), [&](MachineOperand &O){ return shouldBeFP(O); });
+
+ if (!IsFP)
+ continue;
+
+ LLT Ty = MemOp->getType();
+ LLT NewTy = updateType(Ty, true);
+ MemOp->setType(NewTy);
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool InferTypeInfo::runOnMachineFunction(MachineFunction &MF) {
+ init(MF);
+ bool Changed = false;
+ Changed |= inferTypeInfo(MF);
+ return Changed;
+}
\ No newline at end of file
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
index dc9e449195159a..37d1cf7e93aeaf 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
@@ -26,6 +26,7 @@ static_library("GlobalISel") {
"GISelKnownBits.cpp",
"GlobalISel.cpp",
"IRTranslator.cpp",
+ "InferTypeInfoPass.cpp",
"InlineAsmLowering.cpp",
"InstructionSelect.cpp",
"InstructionSelector.cpp",
>From 9c8056668c78aa31b103b742cd439639ff61034a Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Fri, 10 Jan 2025 16:32:17 +0000
Subject: [PATCH 12/12] patch MIR parser
---
llvm/lib/CodeGen/MIRParser/MIParser.cpp | 51 ++++++++++++++++++++-----
1 file changed, 42 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index f77c4613ad801b..c73f5230a7645a 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -1923,13 +1924,19 @@ static bool verifyAddrSpace(uint64_t AddrSpace) {
}
bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
- if (Token.range().front() == 's' || Token.range().front() == 'p') {
+ if (Token.range().front() == 's' || Token.range().front() == 'i' || Token.range().front() == 'f' || Token.range().front() == 'p') {
StringRef SizeStr = Token.range().drop_front();
if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
- return error("expected integers after 's'/'p' type character");
+ return error("expected integers after 's'/'i'/'f'/'p' type character");
+ }
+
+ if (Token.range().substr(0, 2) == "bf") {
+ StringRef SizeStr = Token.range().drop_front(2);
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 'bf' type string");
}
- if (Token.range().front() == 's') {
+ if (Token.range().front() == 's' || Token.range().front() == 'i') {
auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
if (ScalarSize) {
if (!verifyScalarSize(ScalarSize))
@@ -1949,6 +1956,20 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
lex();
return false;
+ } else if (Token.range().front() == 'f') {
+ auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!ScalarSize || !verifyScalarSize(ScalarSize))
+ return error("invalid size for scalar type");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::IEEE_FLOAT);
+ lex();
+ return false;
+ } else if (Token.range().substr(0, 2) == "bf") {
+ auto ScalarSize = APSInt(Token.range().drop_front(2)).getZExtValue();
+ if (!ScalarSize || !verifyScalarSize(ScalarSize))
+ return error("invalid size for scalar type");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::VARIANT_FLOAT_1);
+ lex();
+ return false;
}
// Now we're looking for a vector.
@@ -1985,14 +2006,16 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
return GetError();
lex();
- if (Token.range().front() != 's' && Token.range().front() != 'p')
+ if (Token.range().front() != 's' && Token.range().front() != 'i' &&
+ Token.range().front() != 'f' && Token.range().front() != 'p' &&
+ Token.range().substr(0, 2) != "bf")
return GetError();
- StringRef SizeStr = Token.range().drop_front();
+ StringRef SizeStr = Token.range().drop_front(Token.range().substr(0, 2) == "bf" ? 2 : 1);
if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
- return error("expected integers after 's'/'p' type character");
+ return error("expected integers after 's'/'i'/'f'/'bf'/'p' type string");
- if (Token.range().front() == 's') {
+ if (Token.range().front() == 's' || Token.range().front() == 'i') {
auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
if (!verifyScalarSize(ScalarSize))
return error("invalid size for scalar element in vector");
@@ -2004,6 +2027,16 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
return error("invalid address space number");
Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
+ } else if (Token.range().front() == 'f') {
+ auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!verifyScalarSize(ScalarSize))
+ return error("invalid size for float element in vector");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::IEEE_FLOAT);
+ } else if (Token.range().substr(0, 2) == "bf") {
+ auto ScalarSize = APSInt(Token.range().drop_front(2)).getZExtValue();
+ if (!verifyScalarSize(ScalarSize))
+ return error("invalid size for bfloat element in vector");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::VARIANT_FLOAT_1);
} else
return GetError();
lex();
@@ -2021,12 +2054,12 @@ bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) {
assert(Token.is(MIToken::Identifier));
StringRef TypeStr = Token.range();
if (TypeStr.front() != 'i' && TypeStr.front() != 's' &&
- TypeStr.front() != 'p')
+ TypeStr.front() != 'p' && TypeStr.front() != 'f' && TypeStr.substr(0, 2) != "bf")
return error(
- "a typed immediate operand should start with one of 'i', 's', or 'p'");
+ "a typed immediate operand should start with one of 'i', 's','f','bf', or 'p'");
- StringRef SizeStr = Token.range().drop_front();
+ StringRef SizeStr = TypeStr.drop_front(TypeStr.substr(0, 2) == "bf" ? 2 : 1);
if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
- return error("expected integers after 'i'/'s'/'p' type character");
+ return error("expected integers after 'i'/'s'/'f'/'bf'/'p' type character");
auto Loc = Token.location();
lex();
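
For reference, the grammar accepted after this change should cover type spellings like the ones below.
This is only a sketch: the instructions around the types are made up, and which spelling the printer
emits for plain scalars still depends on the LLT printing changes earlier in this series:

  %0:_(i32) = COPY $vgpr0            ; 'i' now parses like 's' (integer scalar)
  %1:_(f32) = COPY $vgpr1            ; 'f' -> LLT::floatingPoint(32, FPInfo::IEEE_FLOAT)
  %2:_(bf16) = G_FPTRUNC %1(f32)     ; 'bf' -> LLT::floatingPoint(16, FPInfo::VARIANT_FLOAT_1)
  %3:_(<4 x f16>) = G_IMPLICIT_DEF   ; vector with IEEE float elements
  %4:_(<2 x bf16>) = G_IMPLICIT_DEF  ; vector with bfloat elements
  %5:_(p1) = G_IMPLICIT_DEF          ; pointer types are unchanged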