[llvm] [GlobalISel][AMDGPU] Implement FPInfo for LLT (PR #122503)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 14 06:42:18 PST 2025
https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/122503
>From a558e3dbba7109aa62d60336809d27fcb518914b Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Fri, 4 Oct 2024 09:20:06 +0000
Subject: [PATCH 01/11] apply llt-float.patch
---
llvm/include/llvm/CodeGen/LowLevelTypeUtils.h | 8 +-
llvm/include/llvm/CodeGenTypes/LowLevelType.h | 346 +++++++++++++-----
llvm/lib/CodeGen/LowLevelTypeUtils.cpp | 81 +++-
llvm/lib/CodeGenTypes/LowLevelType.cpp | 75 +++-
4 files changed, 375 insertions(+), 135 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h b/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h
index 142e5cd4e7ad17..af0e436e21166f 100644
--- a/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h
+++ b/llvm/include/llvm/CodeGen/LowLevelTypeUtils.h
@@ -26,20 +26,20 @@ class Type;
struct fltSemantics;
/// Construct a low-level type based on an LLVM type.
-LLT getLLTForType(Type &Ty, const DataLayout &DL);
+LLT getLLTForType(Type &Ty, const DataLayout &DL, bool EnableFPInfo = false);
/// Get a rough equivalent of an MVT for a given LLT. MVT can't distinguish
/// pointers, so these will convert to a plain integer.
-MVT getMVTForLLT(LLT Ty);
+MVT getMVTForLLT(LLT Ty, bool EnableFPInfo = false);
EVT getApproximateEVTForLLT(LLT Ty, LLVMContext &Ctx);
/// Get a rough equivalent of an LLT for a given MVT. LLT does not yet support
/// scalarable vector types, and will assert if used.
-LLT getLLTForMVT(MVT Ty);
+LLT getLLTForMVT(MVT Ty, bool EnableFPInfo = false);
/// Get the appropriate floating point arithmetic semantic based on the bit size
/// of the given scalar LLT.
const llvm::fltSemantics &getFltSemanticForLLT(LLT Ty);
-}
+} // namespace llvm
#endif // LLVM_CODEGEN_LOWLEVELTYPEUTILS_H
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index 06879e1f8d15b0..cf5f740c364d39 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -28,78 +28,144 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include <cassert>
namespace llvm {
+extern cl::opt<bool> EnableFPInfo;
+
class Type;
class raw_ostream;
class LLT {
public:
+ enum class FPInfo {
+ IEEE_FLOAT = 0x0,
+ VARIANT_FLOAT_1 = 0x1,
+ VARIANT_FLOAT_2 = 0x2,
+ VARIANT_FLOAT_3 = 0x3,
+ };
+
+ enum class Kind : uint64_t {
+ INVALID = 0b000,
+ INTEGER = 0b001,
+ FLOAT = 0b010,
+ POINTER = 0b011,
+ VECTOR_INTEGER = 0b101,
+ VECTOR_FLOAT = 0b110,
+ VECTOR_POINTER = 0b111,
+ };
+
+ constexpr static Kind toVector(Kind Ty) {
+ if (Ty == Kind::POINTER)
+ return Kind::VECTOR_POINTER;
+
+ if (Ty == Kind::INTEGER)
+ return Kind::VECTOR_INTEGER;
+
+ if (Ty == Kind::FLOAT)
+ return Kind::VECTOR_FLOAT;
+
+ assert(false && "Type is already a vector type");
+ return Ty;
+ }
+
+ constexpr static Kind toScalar(Kind Ty) {
+ if (Ty == Kind::VECTOR_POINTER)
+ return Kind::POINTER;
+
+ if (Ty == Kind::VECTOR_INTEGER)
+ return Kind::INTEGER;
+
+ if (Ty == Kind::VECTOR_FLOAT)
+ return Kind::FLOAT;
+
+ assert(false && "Type is already a scalar type");
+ return Ty;
+ }
+
/// Get a low-level scalar or aggregate "bag of bits".
+ [[deprecated("Use LLT::integer(unsigned) instead.")]]
static constexpr LLT scalar(unsigned SizeInBits) {
- return LLT{/*isPointer=*/false, /*isVector=*/false, /*isScalar=*/true,
- ElementCount::getFixed(0), SizeInBits,
- /*AddressSpace=*/0};
+ return LLT{Kind::INTEGER, ElementCount::getFixed(0), SizeInBits,
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
+ }
+
+ static constexpr LLT integer(unsigned SizeInBits) {
+ return LLT{Kind::INTEGER, ElementCount::getFixed(0), SizeInBits,
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
+ }
+
+ static constexpr LLT floatingPoint(unsigned SizeInBits, FPInfo FP) {
+ return LLT{Kind::FLOAT, ElementCount::getFixed(0), SizeInBits,
+ /*AddressSpace=*/0, FP};
}
/// Get a low-level token; just a scalar with zero bits (or no size).
static constexpr LLT token() {
- return LLT{/*isPointer=*/false, /*isVector=*/false,
- /*isScalar=*/true, ElementCount::getFixed(0),
+ return LLT{Kind::INTEGER, ElementCount::getFixed(0),
/*SizeInBits=*/0,
- /*AddressSpace=*/0};
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
}
/// Get a low-level pointer in the given address space.
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
assert(SizeInBits > 0 && "invalid pointer size");
- return LLT{/*isPointer=*/true, /*isVector=*/false, /*isScalar=*/false,
- ElementCount::getFixed(0), SizeInBits, AddressSpace};
+ return LLT{Kind::POINTER, ElementCount::getFixed(0), SizeInBits,
+ AddressSpace, static_cast<FPInfo>(0)};
}
/// Get a low-level vector of some number of elements and element width.
+ [[deprecated("Use LLT::vector(EC, LLT) instead.")]]
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits) {
assert(!EC.isScalar() && "invalid number of vector elements");
- return LLT{/*isPointer=*/false, /*isVector=*/true, /*isScalar=*/false,
- EC, ScalarSizeInBits, /*AddressSpace=*/0};
+ return LLT{Kind::VECTOR_INTEGER, EC, ScalarSizeInBits,
+ /*AddressSpace=*/0, static_cast<FPInfo>(0)};
}
/// Get a low-level vector of some number of elements and element type.
static constexpr LLT vector(ElementCount EC, LLT ScalarTy) {
assert(!EC.isScalar() && "invalid number of vector elements");
assert(!ScalarTy.isVector() && "invalid vector element type");
- return LLT{ScalarTy.isPointer(),
- /*isVector=*/true,
- /*isScalar=*/false,
- EC,
- ScalarTy.getSizeInBits().getFixedValue(),
- ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0};
+
+ Kind Info = toVector(ScalarTy.Info);
+ return LLT{Info, EC, ScalarTy.getSizeInBits().getFixedValue(),
+ ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0,
+ ScalarTy.isFloat() ? ScalarTy.getFPInfo()
+ : static_cast<FPInfo>(0)};
}
+  /// Get a 16-bit brain float value.
+ static constexpr LLT bfloat() { return integer(16); }
+
/// Get a 16-bit IEEE half value.
- /// TODO: Add IEEE semantics to type - This currently returns a simple `scalar(16)`.
- static constexpr LLT float16() {
- return scalar(16);
- }
+ static constexpr LLT float16() { return integer(16); }
/// Get a 32-bit IEEE float value.
- static constexpr LLT float32() {
- return scalar(32);
- }
+ static constexpr LLT float32() { return integer(32); }
/// Get a 64-bit IEEE double value.
- static constexpr LLT float64() {
- return scalar(64);
- }
+ static constexpr LLT float64() { return integer(64); }
+
+  /// Get an 80-bit X86 floating point value.
+ static constexpr LLT x86fp80() { return integer(80); }
+
+ /// Get a 128-bit IEEE quad value.
+ static constexpr LLT float128() { return floatingPoint(128, FPInfo::IEEE_FLOAT); }
+
+ /// Get a 128-bit PowerPC double double value.
+ static constexpr LLT ppcf128() { return integer(128); }
/// Get a low-level fixed-width vector of some number of elements and element
/// width.
+ [[deprecated("Use LLT::fixed_vector(unsigned, LLT) instead.")]]
static constexpr LLT fixed_vector(unsigned NumElements,
unsigned ScalarSizeInBits) {
- return vector(ElementCount::getFixed(NumElements), ScalarSizeInBits);
+ return vector(ElementCount::getFixed(NumElements),
+ LLT::integer(ScalarSizeInBits));
}
/// Get a low-level fixed-width vector of some number of elements and element
@@ -110,9 +176,11 @@ class LLT {
/// Get a low-level scalable vector of some number of elements and element
/// width.
+ [[deprecated("Use LLT::scalable_vector(unsigned, LLT) instead.")]]
static constexpr LLT scalable_vector(unsigned MinNumElements,
unsigned ScalarSizeInBits) {
- return vector(ElementCount::getScalable(MinNumElements), ScalarSizeInBits);
+ return vector(ElementCount::getScalable(MinNumElements),
+ LLT::integer(ScalarSizeInBits));
}
/// Get a low-level scalable vector of some number of elements and element
@@ -125,33 +193,77 @@ class LLT {
return EC.isScalar() ? ScalarTy : LLT::vector(EC, ScalarTy);
}
+ [[deprecated("Use LLT::scalarOrVector(EC, LLT) instead.")]]
static constexpr LLT scalarOrVector(ElementCount EC, uint64_t ScalarSize) {
assert(ScalarSize <= std::numeric_limits<unsigned>::max() &&
"Not enough bits in LLT to represent size");
- return scalarOrVector(EC, LLT::scalar(static_cast<unsigned>(ScalarSize)));
+ return scalarOrVector(EC, LLT::integer(static_cast<unsigned>(ScalarSize)));
}
- explicit constexpr LLT(bool isPointer, bool isVector, bool isScalar,
- ElementCount EC, uint64_t SizeInBits,
- unsigned AddressSpace)
+ explicit constexpr LLT(Kind Info, ElementCount EC, uint64_t SizeInBits,
+ unsigned AddressSpace, FPInfo FP)
: LLT() {
- init(isPointer, isVector, isScalar, EC, SizeInBits, AddressSpace);
+ init(Info, EC, SizeInBits, AddressSpace, FP);
}
- explicit constexpr LLT()
- : IsScalar(false), IsPointer(false), IsVector(false), RawData(0) {}
- explicit LLT(MVT VT);
+ explicit LLT(MVT VT, bool EnableFPInfo = false);
+ explicit constexpr LLT() : Info(static_cast<Kind>(0)), RawData(0) {}
- constexpr bool isValid() const { return IsScalar || RawData != 0; }
- constexpr bool isScalar() const { return IsScalar; }
- constexpr bool isToken() const { return IsScalar && RawData == 0; };
- constexpr bool isVector() const { return isValid() && IsVector; }
+ constexpr bool isValid() const {
+ return isToken() || RawData != 0;
+ }
+ constexpr bool isScalar() const {
+ return Info == Kind::INTEGER || Info == Kind::FLOAT;
+ }
+ constexpr bool isScalar(unsigned Size) const {
+ return isScalar() && getScalarSizeInBits() == Size;
+ }
+ constexpr bool isFloat() const { return isValid() && Info == Kind::FLOAT; }
+ constexpr bool isFloat(unsigned Size) const {
+ return isFloat() && getScalarSizeInBits() == Size;
+ }
+ constexpr bool isVariantFloat() const {
+ return isFloat() && (getFPInfo() == FPInfo::VARIANT_FLOAT_1 ||
+ getFPInfo() == FPInfo::VARIANT_FLOAT_2 ||
+ getFPInfo() == FPInfo::VARIANT_FLOAT_3);
+ }
+ constexpr bool isVariantFloat(FPInfo Variant) const {
+ return isFloat() && getFPInfo() == Variant;
+ }
+  constexpr bool isVariantFloat(unsigned Size, FPInfo Variant) const {
+    return isVariantFloat(Variant) && getScalarSizeInBits() == Size;
+  }
+ constexpr bool isFloatVector() const {
+ return isVector() && Info == Kind::VECTOR_FLOAT;
+ }
+ constexpr bool isBFloat() const { return isVariantFloat(16, FPInfo::VARIANT_FLOAT_1); }
+ constexpr bool isX86FP80() const { return isVariantFloat(80, FPInfo::VARIANT_FLOAT_1); }
+ constexpr bool isPPCF128() const { return isVariantFloat(128, FPInfo::VARIANT_FLOAT_1); }
+ constexpr bool isToken() const {
+ return Info == Kind::INTEGER && RawData == 0;
+ }
+ constexpr bool isInteger() const {
+ return isValid() && Info == Kind::INTEGER;
+ }
+ constexpr bool isInteger(unsigned Size) const {
+ return isInteger() && getScalarSizeInBits() == Size;
+ }
+ constexpr bool isIntegerVector() const {
+ return isVector() && Info == Kind::VECTOR_INTEGER;
+ }
+ constexpr bool isVector() const {
+ return isValid() &&
+ (Info == Kind::VECTOR_INTEGER || Info == Kind::VECTOR_FLOAT ||
+ Info == Kind::VECTOR_POINTER);
+ }
constexpr bool isPointer() const {
- return isValid() && IsPointer && !IsVector;
+ return isValid() && Info == Kind::POINTER;
+ }
+ constexpr bool isPointerVector() const {
+ return isVector() && Info == Kind::VECTOR_POINTER;
}
- constexpr bool isPointerVector() const { return IsPointer && isVector(); }
constexpr bool isPointerOrPointerVector() const {
- return IsPointer && isValid();
+ return isPointer() || isPointerVector();
}
/// Returns the number of elements in a vector LLT. Must only be called on
@@ -176,12 +288,18 @@ class LLT {
/// if the LLT is not a vector type.
constexpr bool isFixedVector() const { return isVector() && !isScalable(); }
+ constexpr bool isFixedVector(unsigned NumElements,
+ unsigned ScalarSize) const {
+ return isFixedVector() && getNumElements() == NumElements &&
+ getScalarSizeInBits() == ScalarSize;
+ }
+
/// Returns true if the LLT is a scalable vector. Returns false otherwise,
/// even if the LLT is not a vector type.
constexpr bool isScalableVector() const { return isVector() && isScalable(); }
constexpr ElementCount getElementCount() const {
- assert(IsVector && "cannot get number of elements on scalar/aggregate");
+ assert(isVector() && "cannot get number of elements on scalar/aggregate");
return ElementCount::get(getFieldValue(VectorElementsFieldInfo),
isScalable());
}
@@ -206,6 +324,13 @@ class LLT {
return isVector() ? getElementType() : *this;
}
+ constexpr FPInfo getFPInfo() const {
+ assert((isFloat() || isFloatVector()) &&
+ "cannot get FP info for non float type");
+
+ return FPInfo(getFieldValue(ScalarFPFieldInfo));
+ }
+
/// If this type is a vector, return a vector with the same number of elements
/// but the new element type. Otherwise, return the new element type.
constexpr LLT changeElementType(LLT NewEltTy) const {
@@ -216,10 +341,10 @@ class LLT {
/// but the new element size. Otherwise, return the new element type. Invalid
/// for pointer types. For pointer types, use changeElementType.
constexpr LLT changeElementSize(unsigned NewEltSize) const {
- assert(!isPointerOrPointerVector() &&
+ assert(!isPointerOrPointerVector() && !(isFloat() || isFloatVector()) &&
"invalid to directly change element size for pointers");
- return isVector() ? LLT::vector(getElementCount(), NewEltSize)
- : LLT::scalar(NewEltSize);
+ return isVector() ? LLT::vector(getElementCount(), LLT::integer(NewEltSize))
+ : LLT::integer(NewEltSize);
}
/// Return a vector or scalar with the same element type and the new element
@@ -228,6 +353,20 @@ class LLT {
return LLT::scalarOrVector(EC, getScalarType());
}
+ constexpr LLT changeElementCount(unsigned NumElements) const {
+ return changeElementCount(ElementCount::getFixed(NumElements));
+ }
+
+  constexpr LLT changeFPInfo(FPInfo FP) const {
+    assert((isFloat() || isFloatVector()) &&
+           "cannot change FPInfo for non floating point types");
+    if (isFloatVector())
+      return LLT::vector(getElementCount(),
+                         getElementType().changeFPInfo(FP));
+
+    return LLT::floatingPoint(getSizeInBits(), FP);
+  }
+
/// Return a type that is \p Factor times smaller. Reduces the number of
/// elements if this is a vector, or the bitwidth for scalar/pointers. Does
/// not attempt to handle cases that aren't evenly divisible.
@@ -242,7 +381,7 @@ class LLT {
}
assert(getScalarSizeInBits() % Factor == 0);
- return scalar(getScalarSizeInBits() / Factor);
+ return integer(getScalarSizeInBits() / Factor);
}
/// Produce a vector type that is \p Factor times bigger, preserving the
@@ -276,10 +415,23 @@ class LLT {
/// Returns the vector's element type. Only valid for vector types.
constexpr LLT getElementType() const {
assert(isVector() && "cannot get element type of scalar/aggregate");
- if (IsPointer)
+ if (isPointerVector())
return pointer(getAddressSpace(), getScalarSizeInBits());
- else
- return scalar(getScalarSizeInBits());
+
+ if (isFloatVector())
+ return floatingPoint(getScalarSizeInBits(), getFPInfo());
+
+ return integer(getScalarSizeInBits());
+ }
+
+ constexpr LLT dropType() const {
+ if (isPointer() || isPointerVector())
+ return *this;
+
+ if (isVector())
+ return vector(getElementCount(), LLT::integer(getScalarSizeInBits()));
+
+ return integer(getSizeInBits());
}
void print(raw_ostream &OS) const;
@@ -289,8 +441,7 @@ class LLT {
#endif
constexpr bool operator==(const LLT &RHS) const {
- return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector &&
- IsScalar == RHS.IsScalar && RHS.RawData == RawData;
+ return Info == RHS.Info && RawData == RHS.RawData;
}
constexpr bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
@@ -300,37 +451,33 @@ class LLT {
private:
/// LLT is packed into 64 bits as follows:
- /// isScalar : 1
- /// isPointer : 1
- /// isVector : 1
- /// with 61 bits remaining for Kind-specific data, packed in bitfields
- /// as described below. As there isn't a simple portable way to pack bits
- /// into bitfields, here the different fields in the packed structure is
+ /// Info : 3
+ /// RawData : 61
+ /// with 61 bits of RawData remaining for Kind-specific data, packed in
+ /// bitfields as described below. As there isn't a simple portable way to pack
+ /// bits into bitfields, here the different fields in the packed structure is
/// described in static const *Field variables. Each of these variables
/// is a 2-element array, with the first element describing the bitfield size
/// and the second element describing the bitfield offset.
///
- /// +--------+---------+--------+----------+----------------------+
- /// |isScalar|isPointer|isVector| RawData |Notes |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 0 | 0 | 0 |Invalid |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 0 | 1 | 0 |Tombstone Key |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 1 | 0 | 0 |Empty Key |
- /// +--------+---------+--------+----------+----------------------+
- /// | 1 | 0 | 0 | 0 |Token |
- /// +--------+---------+--------+----------+----------------------+
- /// | 1 | 0 | 0 | non-zero |Scalar |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 1 | 0 | non-zero |Pointer |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 0 | 1 | non-zero |Vector of non-pointer |
- /// +--------+---------+--------+----------+----------------------+
- /// | 0 | 1 | 1 | non-zero |Vector of pointer |
- /// +--------+---------+--------+----------+----------------------+
- ///
- /// Everything else is reserved.
+ /*
+ --- LLT ---
+
+ 63 56 47 39 31 23 15 7 0
+ | | | | | | | | |
+ |xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|xxxxxxxx|
+ ................................... (1)
+ ***************** (2)
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~ (3)
+ ^^^^^^^^^^^^^^^^^ (4)
+ @ (5)
+ ### (6)
+ %%% (7)
+
+ (1) ScalarSize (2) PointerSize (3) PointerAddressSpace
+ (4) VectorElements (5) VectorScalable (6) FPInfo (7) Kind
+
+ */
typedef int BitFieldInfo[2];
///
/// This is how the bitfields are packed per Kind:
@@ -340,6 +487,7 @@ class LLT {
/// * Non-pointer scalar (isPointer == 0 && isVector == 0):
/// SizeInBits: 32;
static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29};
+ static const constexpr BitFieldInfo ScalarFPFieldInfo{2, 21};
/// * Pointer (isPointer == 1 && isVector == 0):
/// SizeInBits: 16;
/// AddressSpace: 24;
@@ -357,20 +505,20 @@ class LLT {
/// AddressSpace: 24;
/// Scalable: 1;
- uint64_t IsScalar : 1;
- uint64_t IsPointer : 1;
- uint64_t IsVector : 1;
+ Kind Info : 3;
uint64_t RawData : 61;
static constexpr uint64_t getMask(const BitFieldInfo FieldInfo) {
const int FieldSizeInBits = FieldInfo[0];
return (((uint64_t)1) << FieldSizeInBits) - 1;
}
+
static constexpr uint64_t maskAndShift(uint64_t Val, uint64_t Mask,
uint8_t Shift) {
assert(Val <= Mask && "Value too large for field");
return (Val & Mask) << Shift;
}
+
static constexpr uint64_t maskAndShift(uint64_t Val,
const BitFieldInfo FieldInfo) {
return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]);
@@ -380,21 +528,20 @@ class LLT {
return getMask(FieldInfo) & (RawData >> FieldInfo[1]);
}
- constexpr void init(bool IsPointer, bool IsVector, bool IsScalar,
- ElementCount EC, uint64_t SizeInBits,
- unsigned AddressSpace) {
+ constexpr void init(Kind Info, ElementCount EC, uint64_t SizeInBits,
+ unsigned AddressSpace, FPInfo FP) {
assert(SizeInBits <= std::numeric_limits<unsigned>::max() &&
"Not enough bits in LLT to represent size");
- this->IsPointer = IsPointer;
- this->IsVector = IsVector;
- this->IsScalar = IsScalar;
- if (IsPointer) {
+ this->Info = Info;
+ if (Info == Kind::POINTER || Info == Kind::VECTOR_POINTER) {
RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) |
maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo);
} else {
- RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo);
+ RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo) |
+ maskAndShift((uint64_t) FP, ScalarFPFieldInfo);
}
- if (IsVector) {
+
+ if (Info == Kind::VECTOR_INTEGER || Info == Kind::VECTOR_FLOAT || Info == Kind::VECTOR_POINTER) {
RawData |= maskAndShift(EC.getKnownMinValue(), VectorElementsFieldInfo) |
maskAndShift(EC.isScalable() ? 1 : 0, VectorScalableFieldInfo);
}
@@ -402,25 +549,24 @@ class LLT {
public:
constexpr uint64_t getUniqueRAWLLTData() const {
- return ((uint64_t)RawData) << 3 | ((uint64_t)IsScalar) << 2 |
- ((uint64_t)IsPointer) << 1 | ((uint64_t)IsVector);
+ return ((uint64_t)RawData) << 3 | ((uint64_t)Info);
}
};
-inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
+inline raw_ostream &operator<<(raw_ostream &OS, const LLT &Ty) {
Ty.print(OS);
return OS;
}
-template<> struct DenseMapInfo<LLT> {
+template <> struct DenseMapInfo<LLT> {
static inline LLT getEmptyKey() {
LLT Invalid;
- Invalid.IsPointer = true;
+ Invalid.Info = static_cast<LLT::Kind>(2);
return Invalid;
}
static inline LLT getTombstoneKey() {
LLT Invalid;
- Invalid.IsVector = true;
+ Invalid.Info = static_cast<LLT::Kind>(3);
return Invalid;
}
static inline unsigned getHashValue(const LLT &Ty) {
@@ -428,10 +574,10 @@ template<> struct DenseMapInfo<LLT> {
return DenseMapInfo<uint64_t>::getHashValue(Val);
}
static bool isEqual(const LLT &LHS, const LLT &RHS) {
- return LHS == RHS;
+ return LHS.getUniqueRAWLLTData() == RHS.getUniqueRAWLLTData();
}
};
-}
+} // namespace llvm
#endif // LLVM_CODEGEN_LOWLEVELTYPE_H
diff --git a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
index 936c9fbb2fff02..f229d954db3f2b 100644
--- a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
+++ b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
@@ -15,12 +15,13 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
+LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL, bool EnableFPInfo) {
if (auto VTy = dyn_cast<VectorType>(&Ty)) {
auto EC = VTy->getElementCount();
- LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL);
+ LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL, EnableFPInfo);
if (EC.isScalar())
return ScalarTy;
return LLT::vector(EC, ScalarTy);
@@ -36,7 +37,37 @@ LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
// concerned.
auto SizeInBits = DL.getTypeSizeInBits(&Ty);
assert(SizeInBits != 0 && "invalid zero-sized type");
- return LLT::scalar(SizeInBits);
+
+ if (Ty.isFloatingPointTy()) {
+ if (Ty.isHalfTy())
+ return LLT::float16();
+
+ if (Ty.isBFloatTy())
+ return LLT::bfloat();
+
+ if (Ty.isFloatTy())
+ return LLT::float32();
+
+ if (Ty.isDoubleTy())
+ return LLT::float64();
+
+ if (Ty.isX86_FP80Ty())
+ return LLT::x86fp80();
+
+ if (Ty.isFP128Ty())
+ return LLT::float128();
+
+ if (Ty.isPPC_FP128Ty())
+ return LLT::ppcf128();
+
+ llvm_unreachable("Unhandled LLVM IR floating point type");
+ }
+
+ if (Ty.isIntegerTy()) {
+ return LLT::integer(SizeInBits);
+ }
+
+ return LLT::integer(SizeInBits);
}
if (Ty.isTokenTy())
@@ -45,13 +76,26 @@ LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
return LLT();
}
-MVT llvm::getMVTForLLT(LLT Ty) {
- if (!Ty.isVector())
- return MVT::getIntegerVT(Ty.getSizeInBits());
+MVT llvm::getMVTForLLT(LLT Ty, bool EnableFPInfo) {
+ if (Ty.isVector()) {
+    return MVT::getVectorVT(getMVTForLLT(Ty.getElementType(), EnableFPInfo), Ty.getElementCount());
+ }
+
+ if (Ty.isFloat()) {
+ if (Ty == LLT::bfloat())
+ return MVT::bf16;
+
+ if (Ty == LLT::x86fp80())
+ return MVT::f80;
+
+ if (Ty == LLT::ppcf128())
+ return MVT::ppcf128;
+
+ return MVT::getFloatingPointVT(Ty.getSizeInBits());
+ }
- return MVT::getVectorVT(
- MVT::getIntegerVT(Ty.getElementType().getSizeInBits()),
- Ty.getElementCount());
+
+ return MVT::getIntegerVT(Ty.getSizeInBits());
}
EVT llvm::getApproximateEVTForLLT(LLT Ty, LLVMContext &Ctx) {
@@ -63,16 +107,29 @@ EVT llvm::getApproximateEVTForLLT(LLT Ty, LLVMContext &Ctx) {
return EVT::getIntegerVT(Ctx, Ty.getSizeInBits());
}
-LLT llvm::getLLTForMVT(MVT Ty) {
+LLT llvm::getLLTForMVT(MVT Ty, bool EnableFPInfo) {
+  if (EnableFPInfo)
+    return LLT(Ty, EnableFPInfo);
+
if (!Ty.isVector())
- return LLT::scalar(Ty.getSizeInBits());
+ return LLT::integer(Ty.getSizeInBits());
return LLT::scalarOrVector(Ty.getVectorElementCount(),
- Ty.getVectorElementType().getSizeInBits());
+ LLT::integer(Ty.getVectorElementType().getSizeInBits()));
}
const llvm::fltSemantics &llvm::getFltSemanticForLLT(LLT Ty) {
assert(Ty.isScalar() && "Expected a scalar type.");
+
+ if (Ty.isBFloat())
+ return APFloat::BFloat();
+ if (Ty.isX86FP80())
+ return APFloat::x87DoubleExtended();
+ if (Ty.isPPCF128())
+ return APFloat::PPCDoubleDouble();
+
+ assert(!Ty.isVariantFloat() && "Unhandled variant float type");
+
switch (Ty.getSizeInBits()) {
case 16:
return APFloat::IEEEhalf();
diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp
index 4785f2652b00e8..947b22de67cff1 100644
--- a/llvm/lib/CodeGenTypes/LowLevelType.cpp
+++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp
@@ -16,36 +16,72 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-LLT::LLT(MVT VT) {
- if (VT.isVector()) {
- bool asVector = VT.getVectorMinNumElements() > 1 || VT.isScalableVector();
- init(/*IsPointer=*/false, asVector, /*IsScalar=*/!asVector,
- VT.getVectorElementCount(), VT.getVectorElementType().getSizeInBits(),
- /*AddressSpace=*/0);
- } else if (VT.isValid() && !VT.isScalableTargetExtVT()) {
- // Aggregates are no different from real scalars as far as GlobalISel is
- // concerned.
- init(/*IsPointer=*/false, /*IsVector=*/false, /*IsScalar=*/true,
- ElementCount::getFixed(0), VT.getSizeInBits(), /*AddressSpace=*/0);
- } else {
- IsScalar = false;
- IsPointer = false;
- IsVector = false;
- RawData = 0;
+cl::opt<bool> llvm::EnableFPInfo(
+ "enable-fpinfo",
+ cl::desc("Enable low level types to carry floating point information"),
+ cl::Optional, cl::Hidden, cl::init(false));
+
+static std::optional<LLT::FPInfo> deriveFPInfo(MVT VT) {
+ if (!VT.isFloatingPoint())
+ return std::nullopt;
+
+ switch (VT.getScalarType().SimpleTy) {
+ case MVT::bf16:
+ case MVT::ppcf128:
+ return LLT::FPInfo::VARIANT_FLOAT_1;
+ default:
+ return LLT::FPInfo::IEEE_FLOAT;
}
}
+LLT::LLT(MVT VT, bool EnableFPInfo) {
+ auto FP = EnableFPInfo ? deriveFPInfo(VT) : std::nullopt;
+ bool AsVector = VT.isVector() && (VT.getVectorMinNumElements() > 1 || VT.isScalableVector());
+
+ Kind Info;
+ if (EnableFPInfo && FP.has_value())
+ Info = AsVector ? Kind::VECTOR_FLOAT : Kind::FLOAT;
+ else
+ Info = AsVector ? Kind::VECTOR_INTEGER : Kind::INTEGER;
+
+ if (VT.isVector()) {
+ init(Info,
+ VT.getVectorElementCount(),
+ VT.getVectorElementType().getSizeInBits(),
+ /*AddressSpace=*/0, FP.value_or(FPInfo::IEEE_FLOAT));
+ } else if (VT.isValid() && !VT.isScalableTargetExtVT()) {
+ // Aggregates are no different from real scalars as far as GlobalISel is
+ // concerned.
+ init(Info, ElementCount::getFixed(0), VT.getSizeInBits(),
+ /*AddressSpace=*/0, FP.value_or(FPInfo::IEEE_FLOAT));
+ } else {
+ this->Info = static_cast<Kind>(0);
+ this->RawData = 0;
+ }
+ }
+
void LLT::print(raw_ostream &OS) const {
+ constexpr bool EnableFPInfo = false;
if (isVector()) {
OS << "<";
OS << getElementCount() << " x " << getElementType() << ">";
- } else if (isPointer())
+ } else if (isPointer()) {
OS << "p" << getAddressSpace();
- else if (isValid()) {
+ } else if (EnableFPInfo && isBFloat()) {
+ OS << "bf16";
+ } else if (EnableFPInfo && isPPCF128()) {
+ OS << "ppcf128";
+ } else if (EnableFPInfo && isFloat()) {
+ assert(!isVariantFloat() && "unknown float variant");
+ OS << "f" << getScalarSizeInBits();
+ } else if (EnableFPInfo && isInteger()) {
+ OS << "i" << getScalarSizeInBits();
+ } else if (isValid()) {
assert(isScalar() && "unexpected type");
OS << "s" << getScalarSizeInBits();
- } else
+ } else {
OS << "LLT_invalid";
+ }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -56,6 +92,7 @@ LLVM_DUMP_METHOD void LLT::dump() const {
#endif
const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::ScalarFPFieldInfo;
const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo;
const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo;
const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo;
>From 2831fe8dd4da2facf39f1fce1202deb86ab713f1 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Mon, 11 Nov 2024 16:33:31 +0100
Subject: [PATCH 02/11] add FP LLTs to TableGen
---
.../GlobalISel/GlobalISelMatchTable.cpp | 101 +++++++++++-------
1 file changed, 62 insertions(+), 39 deletions(-)
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index a81f2b53f2846e..fe2757ac66a3c6 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -349,42 +349,95 @@ std::string LLTCodeGen::getCxxEnumValue() const {
void LLTCodeGen::emitCxxEnumValue(raw_ostream &OS) const {
if (Ty.isScalar()) {
- OS << "GILLT_s" << Ty.getSizeInBits();
+ if (Ty.isBFloat())
+ OS << "GILLT_bf16";
+ else if (Ty.isPPCF128())
+ OS << "GILLT_ppcf128";
+ else if (Ty.isX86FP80())
+ OS << "GILLT_x86fp80";
+ else if (Ty.isFloat())
+ OS << "GILLT_f" << Ty.getSizeInBits();
+ else if (Ty.isInteger())
+ OS << "GILLT_i" << Ty.getSizeInBits();
+ else
+ OS << "GILLT_s" << Ty.getSizeInBits();
return;
}
if (Ty.isVector()) {
OS << (Ty.isScalable() ? "GILLT_nxv" : "GILLT_v")
- << Ty.getElementCount().getKnownMinValue() << "s"
- << Ty.getScalarSizeInBits();
+ << Ty.getElementCount().getKnownMinValue();
+
+ LLT ElemTy = Ty.getElementType();
+ if (ElemTy.isBFloat())
+ OS << "bf16";
+ else if (ElemTy.isPPCF128())
+ OS << "ppcf128";
+ else if (ElemTy.isX86FP80())
+ OS << "x86fp80";
+ else if (ElemTy.isFloat())
+ OS << "f" << ElemTy.getSizeInBits();
+    else if (ElemTy.isInteger())
+ OS << "i" << ElemTy.getSizeInBits();
+ else
+ OS << "s" << ElemTy.getSizeInBits();
return;
}
+
if (Ty.isPointer()) {
OS << "GILLT_p" << Ty.getAddressSpace();
if (Ty.getSizeInBits() > 0)
OS << "s" << Ty.getSizeInBits();
return;
}
+
llvm_unreachable("Unhandled LLT");
}
void LLTCodeGen::emitCxxConstructorCall(raw_ostream &OS) const {
if (Ty.isScalar()) {
- OS << "LLT::scalar(" << Ty.getSizeInBits() << ")";
+ if (Ty.isInteger())
+ OS << "LLT::integer(" << Ty.getScalarSizeInBits() << ")";
+ else if (Ty.isBFloat())
+ OS << "LLT::bfloat()";
+ else if (Ty.isPPCF128())
+ OS << "LLT::ppcf128()";
+ else if (Ty.isX86FP80())
+ OS << "LLT::x86fp80()";
+ else if (Ty.isFloat())
+ OS << "LLT::floatingPoint(" << Ty.getScalarSizeInBits()
+ << ", LLT::FPInfo::IEEE_FLOAT)";
return;
}
+
if (Ty.isVector()) {
OS << "LLT::vector("
<< (Ty.isScalable() ? "ElementCount::getScalable("
: "ElementCount::getFixed(")
- << Ty.getElementCount().getKnownMinValue() << "), "
- << Ty.getScalarSizeInBits() << ")";
+ << Ty.getElementCount().getKnownMinValue() << "), ";
+
+ LLT ElemTy = Ty.getElementType();
+ if (ElemTy.isInteger())
+ OS << "LLT::integer(" << ElemTy.getScalarSizeInBits() << ")";
+ else if (ElemTy.isBFloat())
+ OS << "LLT::bfloat()";
+ else if (ElemTy.isPPCF128())
+ OS << "LLT::ppcf128()";
+ else if (ElemTy.isX86FP80())
+ OS << "LLT::x86fp80()";
+ else if (ElemTy.isFloat())
+ OS << "LLT::floatingPoint(" << ElemTy.getScalarSizeInBits()
+ << ", LLT::FPInfo::IEEE_FLOAT)";
+
+ OS << ")";
return;
}
+
if (Ty.isPointer() && Ty.getSizeInBits() > 0) {
OS << "LLT::pointer(" << Ty.getAddressSpace() << ", " << Ty.getSizeInBits()
<< ")";
return;
}
+
llvm_unreachable("Unhandled LLT");
}
@@ -392,36 +445,7 @@ void LLTCodeGen::emitCxxConstructorCall(raw_ostream &OS) const {
/// particular logic behind the order but either A < B or B < A must be
/// true if A != B.
bool LLTCodeGen::operator<(const LLTCodeGen &Other) const {
- if (Ty.isValid() != Other.Ty.isValid())
- return Ty.isValid() < Other.Ty.isValid();
- if (!Ty.isValid())
- return false;
-
- if (Ty.isVector() != Other.Ty.isVector())
- return Ty.isVector() < Other.Ty.isVector();
- if (Ty.isScalar() != Other.Ty.isScalar())
- return Ty.isScalar() < Other.Ty.isScalar();
- if (Ty.isPointer() != Other.Ty.isPointer())
- return Ty.isPointer() < Other.Ty.isPointer();
-
- if (Ty.isPointer() && Ty.getAddressSpace() != Other.Ty.getAddressSpace())
- return Ty.getAddressSpace() < Other.Ty.getAddressSpace();
-
- if (Ty.isVector() && Ty.getElementCount() != Other.Ty.getElementCount())
- return std::tuple(Ty.isScalable(),
- Ty.getElementCount().getKnownMinValue()) <
- std::tuple(Other.Ty.isScalable(),
- Other.Ty.getElementCount().getKnownMinValue());
-
- assert((!Ty.isVector() || Ty.isScalable() == Other.Ty.isScalable()) &&
- "Unexpected mismatch of scalable property");
- return Ty.isVector()
- ? std::tuple(Ty.isScalable(),
- Ty.getSizeInBits().getKnownMinValue()) <
- std::tuple(Other.Ty.isScalable(),
- Other.Ty.getSizeInBits().getKnownMinValue())
- : Ty.getSizeInBits().getFixedValue() <
- Other.Ty.getSizeInBits().getFixedValue();
+ return Ty.getUniqueRAWLLTData() < Other.Ty.getUniqueRAWLLTData();
}
//===- LLTCodeGen Helpers -------------------------------------------------===//
@@ -430,11 +454,10 @@ std::optional<LLTCodeGen> MVTToLLT(MVT::SimpleValueType SVT) {
MVT VT(SVT);
if (VT.isVector() && !VT.getVectorElementCount().isScalar())
- return LLTCodeGen(
- LLT::vector(VT.getVectorElementCount(), VT.getScalarSizeInBits()));
+ return LLTCodeGen(LLT(VT, true));
if (VT.isInteger() || VT.isFloatingPoint())
- return LLTCodeGen(LLT::scalar(VT.getSizeInBits()));
+ return LLTCodeGen(LLT(VT, true));
return std::nullopt;
}
>From d2b00dff3bfa97cf31b65afd324ef436edea7cbc Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Thu, 14 Nov 2024 11:32:17 +0000
Subject: [PATCH 03/11] use isScalar and isFixedVector
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 +-
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 44 +++----
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 2 +-
.../Target/AMDGPU/AMDGPUCombinerHelper.cpp | 4 +-
.../AMDGPUGlobalISelDivergenceLowering.cpp | 6 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 42 ++++---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 113 ++++++++----------
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 10 +-
.../AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 7 +-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 4 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 34 +++---
11 files changed, 124 insertions(+), 144 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4e3aaf5da7198c..c258ba3379ec1b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6771,7 +6771,7 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
LLT TrueTy = MRI.getType(Select->getTrueReg());
// We only do this combine for scalar boolean conditions.
- if (CondTy != LLT::scalar(1))
+ if (!CondTy.isScalar(1))
return false;
if (TrueTy.isPointer())
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d0a62340a5f322..a4239f2567146d 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2099,7 +2099,7 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
const unsigned Offset = (I - 1) * PartSize;
Register SrcReg = MI.getOperand(I).getReg();
- assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
+ assert(MRI.getType(SrcReg).isScalar(PartSize));
auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
@@ -6596,7 +6596,7 @@ LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
// If all finite floats fit into the narrowed integer type, we can just swap
// out the result type. This is practically only useful for conversions from
// half to at least 16-bits, so just handle the one case.
- if (SrcTy.getScalarType() != LLT::scalar(16) ||
+ if (!SrcTy.getScalarType().isScalar(16) ||
NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
return UnableToLegalize;
@@ -7471,7 +7471,7 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
+ assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(32));
// unsigned cul2f(ulong u) {
// uint lz = clz(u);
@@ -7529,7 +7529,7 @@ LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
+ assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(32));
// For i64 < INT_MAX we simply reuse SITOFP.
// Otherwise, divide i64 by 2, round result by ORing with the lowest bit
@@ -7563,7 +7563,7 @@ LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+ assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(64));
// We create double value from 32 bit parts with 32 exponent difference.
// Note that + and - are float operations that adjust the implicit leading
@@ -7595,7 +7595,7 @@ LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- if (SrcTy == LLT::scalar(1)) {
+ if (SrcTy.isScalar(1)) {
auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
MIRBuilder.buildSelect(Dst, Src, True, False);
@@ -7603,17 +7603,17 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
return Legalized;
}
- if (SrcTy != LLT::scalar(64))
+ if (!SrcTy.isScalar(64))
return UnableToLegalize;
- if (DstTy == LLT::scalar(32))
+ if (DstTy.isScalar(32))
// TODO: SelectionDAG has several alternative expansions to port which may
// be more reasonable depending on the available instructions. We also need
// a more advanced mechanism to choose an optimal version depending on
// target features such as sitofp or CTLZ availability.
return lowerU64ToF32WithSITOFP(MI);
- if (DstTy == LLT::scalar(64))
+ if (DstTy.isScalar(64))
return lowerU64ToF64BitFloatOps(MI);
return UnableToLegalize;
@@ -7626,7 +7626,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);
- if (SrcTy == S1) {
+ if (SrcTy.isScalar(1)) {
auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
MIRBuilder.buildSelect(Dst, Src, True, False);
@@ -7634,10 +7634,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
return Legalized;
}
- if (SrcTy != S64)
+ if (!SrcTy.isScalar(64))
return UnableToLegalize;
- if (DstTy == S32) {
+ if (DstTy.isScalar(32)) {
// signed cl2f(long l) {
// long s = l >> 63;
// float r = cul2f((l + s) ^ s);
@@ -7664,12 +7664,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
- if (SrcTy != S64 && SrcTy != S32)
+ if (!SrcTy.isScalar(64) && !SrcTy.isScalar(32))
return UnableToLegalize;
- if (DstTy != S32 && DstTy != S64)
+ if (!DstTy.isScalar(32) && !DstTy.isScalar(64))
return UnableToLegalize;
// FPTOSI gives same result as FPTOUI for positive signed integers.
@@ -7704,11 +7702,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
// FIXME: Only f32 to i64 conversions are supported.
- if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
+ if (!SrcTy.getScalarType().isScalar(32) || !DstTy.getScalarType().isScalar(64))
return UnableToLegalize;
// Expand f32 -> i64 conversion
@@ -7873,8 +7869,8 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
auto [Dst, Src] = MI.getFirst2Regs();
- assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
- MRI.getType(Src).getScalarType() == LLT::scalar(64));
+ assert(MRI.getType(Dst).getScalarType().isScalar(16) &&
+ MRI.getType(Src).getScalarType().isScalar(64));
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
@@ -7985,10 +7981,8 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
auto [DstTy, SrcTy] = MI.getFirst2LLTs();
- const LLT S64 = LLT::scalar(64);
- const LLT S16 = LLT::scalar(16);
- if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
+ if (DstTy.getScalarType().isScalar(16) && SrcTy.getScalarType().isScalar(64))
return lowerFPTRUNC_F64_TO_F16(MI);
return UnableToLegalize;
@@ -9263,7 +9257,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
// The condition was potentially zero extended before, but we want a sign
// extended boolean.
- if (MaskTy != LLT::scalar(1))
+ if (!MaskTy.isScalar(1))
MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
// Continue the sign extension (or truncate) to match the data type.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index bb00442342d843..666615202a4b5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -72,7 +72,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
if (TRI->isSGPRReg(MRI, PhysReg)) {
LLT Ty = MRI.getType(ExtReg);
LLT S32 = LLT::scalar(32);
- if (Ty != S32) {
+ if (!Ty.isScalar(32)) {
// FIXME: We should probably support readfirstlane intrinsics with all
// legal 32-bit types.
assert(Ty.getSizeInBits() == 32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index 46194ab46ff6a7..22cd79c8cc2058 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -409,7 +409,7 @@ static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
const MachineInstr *Def = MRI.getVRegDef(Reg);
if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
Register SrcReg = Def->getOperand(1).getReg();
- return MRI.getType(SrcReg) == LLT::scalar(16);
+ return MRI.getType(SrcReg).isScalar(16);
}
if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
@@ -428,7 +428,7 @@ bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
Register Src2) const {
assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
Register SrcReg = MI.getOperand(1).getReg();
- if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
+ if (!MRI.hasOneNonDBGUse(SrcReg) || !MRI.getType(SrcReg).isScalar(32))
return false;
return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index fb258547e8fb90..d96d1f5ad39f94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -87,7 +87,7 @@ DivergenceLoweringHelper::DivergenceLoweringHelper(
// _(s1) -> SReg_32/64(s1)
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
- assert(MRI->getType(DstReg) == LLT::scalar(1));
+ assert(MRI->getType(DstReg).isScalar(1));
if (MRI->getRegClassOrNull(DstReg)) {
if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
@@ -100,13 +100,11 @@ void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
void DivergenceLoweringHelper::getCandidatesForLowering(
SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
- LLT S1 = LLT::scalar(1);
-
// Add divergent i1 phis to the list
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB.phis()) {
Register Dst = MI.getOperand(0).getReg();
- if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
+ if (MRI->getType(Dst).isScalar(1) && MUI->isDivergent(Dst))
Vreg1Phis.push_back(&MI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 926c1e4b23b4a1..4a3cbc9bc00d09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -105,7 +105,7 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
MachineOperand &Src = MI.getOperand(1);
// TODO: This should be legalized to s32 if needed
- if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
+ if (MRI->getType(Dst.getReg()).isScalar(1))
return false;
const TargetRegisterClass *DstRC
@@ -225,7 +225,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
// - divergent S1 G_PHI should go through lane mask merging algorithm
// and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
// - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
- if (DefTy == LLT::scalar(1))
+ if (DefTy.isScalar(1))
return false;
// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
@@ -651,9 +651,9 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
// Selection logic below is for V2S16 only.
// For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
Register Dst = MI.getOperand(0).getReg();
- if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
+ if (!MRI->getType(Dst).isFixedVector(2, 16) ||
(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
- SrcTy != LLT::scalar(32)))
+ !SrcTy.isScalar(32)))
return selectImpl(MI, *CoverageInfo);
const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
@@ -991,9 +991,9 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
LLT Ty = MRI->getType(Dst0);
unsigned Opc;
- if (Ty == LLT::scalar(32))
+ if (Ty.isScalar(32))
Opc = AMDGPU::V_DIV_SCALE_F32_e64;
- else if (Ty == LLT::scalar(64))
+ else if (Ty.isScalar(64))
Opc = AMDGPU::V_DIV_SCALE_F64_e64;
else
return false;
@@ -2305,11 +2305,10 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- const LLT S1 = LLT::scalar(1);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
const RegisterBank *DstRB;
- if (DstTy == S1) {
+ if (DstTy.isScalar(1)) {
// This is a special case. We don't treat s1 for legalization artifacts as
// vcc booleans.
DstRB = SrcRB;
@@ -2347,7 +2346,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return true;
}
- if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
+ if (DstTy.isFixedVector(2, 16) && SrcTy.isFixedVector(2, 32)) {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -2639,8 +2638,7 @@ static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
return false;
- assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
- LLT::fixed_vector(2, 16));
+ assert(MRI.getType(Shuffle->getOperand(0).getReg()).isFixedVector(2, 16));
ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
assert(Mask.size() == 2);
@@ -2664,8 +2662,8 @@ bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
Register Src = I.getOperand(1).getReg();
- if (MRI->getType(Dst) == LLT::scalar(32) &&
- MRI->getType(Src) == LLT::scalar(16)) {
+ if (MRI->getType(Dst).isScalar(32) &&
+ MRI->getType(Src).isScalar(16)) {
if (isExtractHiElt(*MRI, Src, Src)) {
MachineBasicBlock *BB = I.getParent();
BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
@@ -2693,7 +2691,7 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
- MRI->getType(Dst) != LLT::scalar(64))
+ !MRI->getType(Dst).isScalar(64))
return false;
Register Src = MI.getOperand(1).getReg();
@@ -2739,7 +2737,7 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
- MRI->getType(Dst) != LLT::scalar(64))
+ !MRI->getType(Dst).isScalar(64))
return false;
Register Src = MI.getOperand(1).getReg();
@@ -2911,7 +2909,7 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
// RegBankSelect knows what it's doing if the branch condition is scc, even
// though it currently does not.
if (!isVCC(CondReg, *MRI)) {
- if (MRI->getType(CondReg) != LLT::scalar(32))
+ if (!MRI->getType(CondReg).isScalar(32))
return false;
CondPhysReg = AMDGPU::SCC;
@@ -3374,7 +3372,7 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
Register ZExtSrc;
if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+ return MRI.getType(ZExtSrc).isScalar(32) ? ZExtSrc : Register();
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
@@ -3382,7 +3380,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
return Register();
assert(Def->getNumOperands() == 3 &&
- MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ MRI.getType(Def->getOperand(0).getReg()).isScalar(64));
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
@@ -3972,7 +3970,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
// This is a workaround. For extension from type i1, `selectImpl()` uses
// patterns from TD file and generates an illegal VGPR to SGPR COPY as type
// i1 can only be hold in a SGPR class.
- if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
+ if (!MRI->getType(I.getOperand(1).getReg()).isScalar(1) &&
selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
@@ -4199,7 +4197,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
if (MI->getOpcode() == AMDGPU::G_FNEG &&
// It's possible to see an f32 fneg here, but unlikely.
// TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+ MRI.getType(Src).isFixedVector(2, 16)) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = MI->getOperand(1).getReg();
MI = MRI.getVRegDef(Src);
@@ -5697,7 +5695,7 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
if (!EncodedOffset)
return std::nullopt;
- assert(MRI->getType(SOffset) == LLT::scalar(32));
+ assert(MRI->getType(SOffset).isScalar(32));
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
@@ -5712,7 +5710,7 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
- assert(MRI->getType(Src) == LLT::scalar(16));
+ assert(MRI->getType(Src).isScalar(16));
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9836e10c36bc5d..439cc78ed705e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -275,7 +275,7 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
if (!QueryTy.isVector())
return false;
const LLT EltTy = QueryTy.getElementType();
- return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
+ return EltTy.isScalar(16) || EltTy.getSizeInBits() >= 32;
};
}
@@ -2470,7 +2470,7 @@ bool AMDGPULegalizerInfo::legalizeFceil(
const LLT S64 = LLT::scalar(64);
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src) == S64);
+ assert(MRI.getType(Src).isScalar(64));
// result = trunc(src)
// if (src > 0.0 && src != result)
@@ -2533,7 +2533,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
const LLT S64 = LLT::scalar(64);
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src) == S64);
+ assert(MRI.getType(Src).isScalar(64));
// TODO: Should this use extract since the low half is unused?
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
@@ -2580,12 +2580,12 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Src) == S64);
+ assert(MRI.getType(Src).isScalar(64));
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
auto ThirtyTwo = B.buildConstant(S32, 32);
- if (MRI.getType(Dst) == S64) {
+ if (MRI.getType(Dst).isScalar(64)) {
auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
: B.buildUITOFP(S64, Unmerge.getReg(1));
@@ -2598,7 +2598,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
return true;
}
- assert(MRI.getType(Dst) == S32);
+ assert(MRI.getType(Dst).isScalar(32));
auto One = B.buildConstant(S32, 1);
@@ -2639,7 +2639,7 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
const LLT S32 = LLT::scalar(32);
const LLT SrcLT = MRI.getType(Src);
- assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
+ assert((SrcLT.isScalar(32) || SrcLT.isScalar(64)) && MRI.getType(Dst).isScalar(64));
unsigned Flags = MI.getFlags();
@@ -2654,7 +2654,7 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
//
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
MachineInstrBuilder Sign;
- if (Signed && SrcLT == S32) {
+ if (Signed && SrcLT.isScalar(32)) {
// However, a 32-bit floating point number has only 23 bits mantissa and
// it's not enough to hold all the significant bits of `lof` if val is
// negative. To avoid the loss of precision, We need to take the absolute
@@ -2664,7 +2664,7 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
Trunc = B.buildFAbs(S32, Trunc, Flags);
}
MachineInstrBuilder K0, K1;
- if (SrcLT == S64) {
+ if (SrcLT.isScalar(64)) {
K0 = B.buildFConstant(
S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
K1 = B.buildFConstant(
@@ -2680,11 +2680,11 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
- auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
+ auto Hi = (Signed && SrcLT.isScalar(64)) ? B.buildFPTOSI(S32, FloorMul)
: B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);
- if (Signed && SrcLT == S32) {
+ if (Signed && SrcLT.isScalar(32)) {
// Flip the result based on the signedness, which is either all 0s or 1s.
Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
// r := xor({lo, hi}, sign) - sign;
@@ -3257,7 +3257,7 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
break;
}
case TargetOpcode::G_FPEXT: {
- return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
+ return MRI.getType(DefMI->getOperand(1).getReg()).isScalar(16);
}
default:
return false;
@@ -3314,7 +3314,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();
- if (Ty == LLT::scalar(16)) {
+ if (Ty.isScalar(16)) {
const LLT F32 = LLT::scalar(32);
// Nothing in half is a denormal when promoted to f32.
auto Ext = B.buildFPExt(F32, Src, Flags);
@@ -3326,7 +3326,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
return true;
}
- assert(Ty == LLT::scalar(32));
+ assert(Ty.isScalar(32));
auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
if (!ScaledInput) {
@@ -3473,7 +3473,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
LLT Ty = B.getMRI()->getType(Dst);
- if (Ty == LLT::scalar(32)) {
+ if (Ty.isScalar(32)) {
auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
if (ScaledInput) {
auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
@@ -3496,7 +3496,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
}
}
- auto Log2Operand = Ty == LLT::scalar(16)
+ auto Log2Operand = Ty.isScalar(16)
? B.buildFLog2(Ty, Src, Flags)
: B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
.addUse(Src)
@@ -3856,13 +3856,13 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
Register Dst = MI.getOperand(0).getReg();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
- assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
+ assert(MRI.getType(Dst).isFixedVector(2, 16));
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
- assert(MRI.getType(Src0) == S32);
+ assert(MRI.getType(Src0).isScalar(32));
Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
}
@@ -4453,7 +4453,7 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
Align Alignment) const {
Register DstReg = MI.getOperand(0).getReg();
- assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
+ assert(B.getMRI()->getType(DstReg).isScalar(32) &&
"unexpected kernarg parameter type");
Register Ptr = getKernargParameterPtr(B, Offset);
@@ -4470,15 +4470,12 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
- LLT S16 = LLT::scalar(16);
- LLT S32 = LLT::scalar(32);
- LLT S64 = LLT::scalar(64);
- if (DstTy == S16)
+ if (DstTy.isScalar(16))
return legalizeFDIV16(MI, MRI, B);
- if (DstTy == S32)
+ if (DstTy.isScalar(32))
return legalizeFDIV32(MI, MRI, B);
- if (DstTy == S64)
+ if (DstTy.isScalar(64))
return legalizeFDIV64(MI, MRI, B);
return false;
@@ -4706,16 +4703,14 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
}
}
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty == S32)
+ if (Ty.isScalar(32))
legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
- else if (Ty == S64)
+ else if (Ty.isScalar(64))
legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
else
return false;
@@ -4727,11 +4722,10 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty != S32 && Ty != S64)
+ if (!Ty.isScalar(32) && !Ty.isScalar(64))
return false;
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
@@ -4771,7 +4765,7 @@ bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
}
}
- if (Ty == S32)
+ if (Ty.isScalar(32))
legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
else
legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
@@ -4806,7 +4800,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
MF.getTarget().Options.UnsafeFPMath;
if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
- if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
+ if (!AllowInaccurateRcp && !ResTy.isScalar(16))
return false;
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
@@ -4840,7 +4834,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
// For f16 require afn or arcp.
// For f32 require afn.
- if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
+ if (!AllowInaccurateRcp && (!ResTy.isScalar(16) ||
!MI.getFlag(MachineInstr::FmArcp)))
return false;
@@ -5154,7 +5148,7 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT Ty = MRI.getType(Res0);
- LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
+ LLT InstrExpTy = Ty.isScalar(16) ? LLT::scalar(16) : LLT::scalar(32);
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
.addUse(Val)
@@ -5401,11 +5395,11 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty == LLT::scalar(32))
+ if (Ty.isScalar(32))
return legalizeFSQRTF32(MI, MRI, B);
- if (Ty == LLT::scalar(64))
+ if (Ty.isScalar(64))
return legalizeFSQRTF64(MI, MRI, B);
- if (Ty == LLT::scalar(16))
+ if (Ty.isScalar(16))
return legalizeFSQRTF16(MI, MRI, B);
return false;
}
@@ -5429,9 +5423,9 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
LLT Ty = MRI.getType(Dst);
const fltSemantics *FltSemantics;
- if (Ty == LLT::scalar(32))
+ if (Ty.isScalar(32))
FltSemantics = &APFloat::IEEEsingle();
- else if (Ty == LLT::scalar(64))
+ else if (Ty.isScalar(64))
FltSemantics = &APFloat::IEEEdouble();
else
return false;
@@ -5777,7 +5771,7 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
LLT StoreVT = MRI.getType(Reg);
- assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
+ assert(StoreVT.isVector() && StoreVT.getElementType().isScalar(16));
if (ST.hasUnpackedD16VMem()) {
auto Unmerge = B.buildUnmerge(S16, Reg);
@@ -5826,7 +5820,7 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
llvm_unreachable("invalid data type");
}
- if (StoreVT == LLT::fixed_vector(3, S16)) {
+ if (StoreVT.isFixedVector(3, 16)) {
Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
.getReg(0);
}
@@ -5839,8 +5833,6 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
MachineRegisterInfo *MRI = B.getMRI();
LLT Ty = MRI->getType(VData);
- const LLT S16 = LLT::scalar(16);
-
// Fixup buffer resources themselves needing to be v4i128.
if (hasBufferRsrcWorkaround(Ty))
return castBufferRsrcToV4I32(VData, B);
@@ -5850,13 +5842,13 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
VData = B.buildBitcast(Ty, VData).getReg(0);
}
// Fixup illegal register types for i8 stores.
- if (Ty == LLT::scalar(8) || Ty == S16) {
+ if (Ty.isScalar(8) || Ty.isScalar(16)) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
return AnyExt;
}
if (Ty.isVector()) {
- if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
+ if (Ty.getElementType().isScalar(16) && Ty.getNumElements() <= 4) {
if (IsFormat)
return handleD16VData(B, *MRI, VData);
}
@@ -6315,7 +6307,7 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
(I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
(I >= Intr->CoordStart && !IsA16)) {
if ((I < Intr->GradientStart) && IsA16 &&
- (B.getMRI()->getType(AddrReg) == S16)) {
+ (B.getMRI()->getType(AddrReg).isScalar(16))) {
assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
@@ -6365,7 +6357,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
if (SrcOp.isReg()) {
AddrRegs.push_back(SrcOp.getReg());
- assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
+ assert(B.getMRI()->getType(SrcOp.getReg()).isScalar(32));
}
}
@@ -6435,9 +6427,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT AddrTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 =
- ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
- const bool IsA16 = AddrTy == S16;
- const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
+ ST.hasG16() ? (BaseOpcode->Gradients && GradTy.isScalar(16)) : GradTy.isScalar(16);
+ const bool IsA16 = AddrTy.isScalar(16);
+ const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType().isScalar(16);
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -6684,14 +6676,14 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (IsTFE) {
Dst1Reg = MI.getOperand(1).getReg();
- if (MRI->getType(Dst1Reg) != S32)
+ if (!MRI->getType(Dst1Reg).isScalar(32))
return false;
// TODO: Make sure the TFE operand bit is set.
MI.removeOperand(1);
// Handle the easy case that requires no repack instructions.
- if (Ty == S32) {
+ if (Ty.isScalar(32)) {
B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
return true;
}
@@ -6726,7 +6718,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
// Avoid a build/concat_vector of 1 entry.
- if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
+ if (Ty.isFixedVector(2, 16) && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
B.buildBitcast(DstReg, ResultRegs[0]);
return true;
}
@@ -6739,7 +6731,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
//
// TODO: We don't really need to use load s32 elements. We would only need one
// cast for the TFE result if a multiple of v2s16 was used.
- if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
+ if (!RegTy.isFixedVector(2, 16) && !ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
Reg = B.buildBitcast(V2S16, Reg).getReg(0);
} else if (ST.hasUnpackedD16VMem()) {
@@ -6764,12 +6756,11 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return true;
}
- assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
+ assert(!ST.hasUnpackedD16VMem() && ResTy.isFixedVector(2, 16));
const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
// Deal with the one annoying legal case.
- const LLT V3S16 = LLT::fixed_vector(3, 16);
- if (Ty == V3S16) {
+ if (Ty.isFixedVector(3, 16)) {
if (IsTFE) {
if (ResultRegs.size() == 1) {
NewResultReg = ResultRegs[0];
@@ -7228,7 +7219,7 @@ bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Src = MI.getOperand(0).getReg();
- if (MRI.getType(Src) != S64)
+ if (!MRI.getType(Src).isScalar(64))
return false;
auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
@@ -7536,7 +7527,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
Register Index = MI.getOperand(5).getReg();
LLT S32 = LLT::scalar(32);
- if (MRI.getType(Index) != S32)
+ if (!MRI.getType(Index).isScalar(32))
MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
@@ -7545,7 +7536,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
LLT S32 = LLT::scalar(32);
- if (MRI.getType(Index) != S32)
+ if (!MRI.getType(Index).isScalar(32))
MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 888817e52e35d4..107d0f8c495032 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -213,7 +213,7 @@ bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
// types are legalized. v4i8 -> v4f32 is probably the only case to worry
// about in practice.
LLT Ty = MRI.getType(DstReg);
- if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
+ if (Ty.isScalar(32) || Ty.isScalar(16)) {
Register SrcReg = MI.getOperand(1).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
@@ -232,10 +232,10 @@ void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
- if (SrcTy != S32)
+ if (!SrcTy.isScalar(32))
SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
- if (Ty == S32) {
+ if (Ty.isScalar(32)) {
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
MI.getFlags());
} else {
@@ -349,7 +349,7 @@ void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
const LLT S32 = LLT::scalar(32);
Register CvtSrc = MatchInfo.CvtVal;
LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
- if (SrcTy != S32) {
+ if (!SrcTy.isScalar(32)) {
assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
}
@@ -418,7 +418,7 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
MachineInstr &MI, unsigned &NewOpcode) const {
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
- if (MRI.getType(Src0) != LLT::scalar(64))
+ if (!MRI.getType(Src0).isScalar(64))
return false;
if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 52c6e5274ae5b7..cf742511f916ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -119,11 +119,11 @@ bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
// Try to find a pattern where an i64 value should get clamped to short.
const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
- if (SrcType != LLT::scalar(64))
+ if (!SrcType.isScalar(64))
return false;
const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- if (DstType != LLT::scalar(16))
+ if (!DstType.isScalar(16))
return false;
Register Base;
@@ -177,8 +177,7 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
Register Src = MatchInfo.Origin;
- assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
- LLT::scalar(64));
+ assert(MI.getParent()->getParent()->getRegInfo().getType(Src).isScalar(64));
const LLT S32 = LLT::scalar(32);
auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 98c48f4fe3705b..68312ef657af3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -195,7 +195,7 @@ bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
// med3 for i16 is only available on gfx9+, and not available for v2i16.
LLT Ty = MRI.getType(Dst);
- if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
+ if ((!Ty.isScalar(16) || !STI.hasMed3_16()) && !Ty.isScalar(32))
return false;
MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
@@ -238,7 +238,7 @@ bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
LLT Ty = MRI.getType(Dst);
// med3 for f16 is only available on gfx9+, and not available for v2f16.
- if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
+ if ((!Ty.isScalar(16) || !STI.hasMed3_16()) && !Ty.isScalar(32))
return false;
auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 224c368cff4a1f..9ffb23501064fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -131,8 +131,8 @@ class ApplyRegBankMapping final : public GISelChangeObserver {
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
if (SrcBank == &AMDGPU::VCCRegBank) {
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(SrcReg) == LLT::scalar(1));
- assert(MRI.getType(DstReg) == S32);
+ assert(MRI.getType(SrcReg).isScalar(1));
+ assert(MRI.getType(DstReg).isScalar(32));
assert(NewBank == &AMDGPU::VGPRRegBank);
// Replace the extension with a select, which really uses the boolean
@@ -170,7 +170,7 @@ class ApplyRegBankMapping final : public GISelChangeObserver {
continue;
const RegisterBank *RB = NewBank;
- if (MRI.getType(Reg) == LLT::scalar(1)) {
+ if (MRI.getType(Reg).isScalar(1)) {
assert(NewBank == &AMDGPU::VGPRRegBank &&
"s1 operands should only be used for vector bools");
assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
@@ -298,7 +298,7 @@ AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
if (!Ty.isValid())
return AMDGPU::SGPRRegBank;
- return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
+ return Ty.isScalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
}
return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
@@ -1495,7 +1495,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank) {
- if (Ty == S32)
+ if (Ty.isScalar(32))
return true;
// There is no 64-bit vgpr bitfield extract instructions so the operation
@@ -1568,7 +1568,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
// TODO: It might be worth using a pseudo here to avoid scc clobber and
// register class constraints.
- unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
+ unsigned Opc = Ty.isScalar(32) ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
(Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
@@ -1790,7 +1790,7 @@ Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
const LLT S16 = LLT::scalar(16);
LLT StoreVT = MRI.getType(Reg);
- if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
+ if (!StoreVT.isVector() || !StoreVT.getElementType().isScalar(16))
return Reg;
auto Unmerge = B.buildUnmerge(S16, Reg);
@@ -2213,7 +2213,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_IMPLICIT_DEF: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::scalar(1))
+ if (!DstTy.isScalar(1))
break;
const RegisterBank *DstBank =
@@ -2243,7 +2243,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_PHI: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::scalar(1))
+ if (!DstTy.isScalar(1))
break;
const LLT S32 = LLT::scalar(32);
@@ -2514,7 +2514,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
// Packed 16-bit operations need to be scalarized and promoted.
- if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
+ if (!DstTy.isScalar(16) && !DstTy.isFixedVector(2, 16))
break;
const RegisterBank *DstBank =
@@ -2588,7 +2588,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register SrcReg1 = MI.getOperand(2).getReg();
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
+ assert(MRI.getType(DstReg).isScalar(64) && "This is a special case for s_mul_u64 "
"that handles only 64-bit operands.");
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
@@ -2684,7 +2684,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register SrcReg = MI.getOperand(1).getReg();
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(SrcReg);
- if (Ty == S32)
+ if (Ty.isScalar(32))
break;
ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
@@ -2708,7 +2708,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register SrcReg = MI.getOperand(1).getReg();
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(SrcReg);
- if (Ty == S32)
+ if (Ty.isScalar(32))
break;
// We can narrow this more efficiently than Helper can by using ffbh/ffbl
@@ -2776,7 +2776,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
- if (SrcTy != LLT::scalar(1))
+ if (!SrcTy.isScalar(1))
return;
// It is not legal to have a legalization artifact with a VCC source. Rather
@@ -3788,9 +3788,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// we need to take the virtual register's type as a hint on how to interpret
// s1 values.
if (!SrcReg.isVirtual() && !DstBank &&
- MRI.getType(DstReg) == LLT::scalar(1))
+ MRI.getType(DstReg).isScalar(1))
DstBank = &AMDGPU::VCCRegBank;
- else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))
+ else if (!DstReg.isVirtual() && MRI.getType(SrcReg).isScalar(1))
DstBank = &AMDGPU::VCCRegBank;
if (!DstBank)
@@ -4154,7 +4154,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
- if (DstTy == LLT::fixed_vector(2, 16)) {
+ if (DstTy.isFixedVector(2, 16)) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
>From 24eef6e0e582fe3538c9236586716735761a37a4 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Thu, 5 Dec 2024 14:29:20 +0000
Subject: [PATCH 04/11] fix MIRBuilder
---
llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index be347006a81f92..adfec6f35d4757 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -563,7 +563,7 @@ MachineInstrBuilder MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc,
Op.getLLTTy(*getMRI()).getSizeInBits())
Opcode = TargetOpcode::G_TRUNC;
else
- assert(Res.getLLTTy(*getMRI()) == Op.getLLTTy(*getMRI()));
+ assert(Res.getLLTTy(*getMRI()).getSizeInBits() == Op.getLLTTy(*getMRI()).getSizeInBits());
return buildInstr(Opcode, Res, Op);
}
>From 246a3417f0a54c64258de0469abf6510b95d1c99 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Thu, 5 Dec 2024 13:50:38 +0000
Subject: [PATCH 05/11] FPInfo: IRTranslator and CallLowering
---
llvm/include/llvm/CodeGen/Analysis.h | 2 +-
.../llvm/CodeGen/GlobalISel/IRTranslator.h | 4 +-
llvm/lib/CodeGen/Analysis.cpp | 8 +-
llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 23 +++--
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 95 ++++++++++---------
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 37 ++++++--
6 files changed, 100 insertions(+), 69 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h
index 362cc30bbd06a1..837a41437d517d 100644
--- a/llvm/include/llvm/CodeGen/Analysis.h
+++ b/llvm/include/llvm/CodeGen/Analysis.h
@@ -95,7 +95,7 @@ inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
/// with the in-memory offsets of each of the individual values.
///
void computeValueLLTs(const DataLayout &DL, Type &Ty,
- SmallVectorImpl<LLT> &ValueTys,
+ SmallVectorImpl<LLT> &ValueTys, bool EnableFPInfo,
SmallVectorImpl<uint64_t> *Offsets = nullptr,
uint64_t StartingOffset = 0);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 6fd05c8fddd5f8..983758e3065604 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -619,6 +619,8 @@ class IRTranslator : public MachineFunctionPass {
CodeGenOptLevel OptLevel;
+ bool EnableFPInfo;
+
/// Current optimization remark emitter. Used to report failures.
std::unique_ptr<OptimizationRemarkEmitter> ORE;
@@ -772,7 +774,7 @@ class IRTranslator : public MachineFunctionPass {
BranchProbability Prob = BranchProbability::getUnknown());
public:
- IRTranslator(CodeGenOptLevel OptLevel = CodeGenOptLevel::None);
+ IRTranslator(CodeGenOptLevel OptLevel = CodeGenOptLevel::None, bool EnableFPInfo = false);
StringRef getPassName() const override { return "IRTranslator"; }
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index e7b9417de8c9f7..7a433354cdfaac 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -139,7 +139,7 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
}
void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
- SmallVectorImpl<LLT> &ValueTys,
+ SmallVectorImpl<LLT> &ValueTys, bool EnableFPInfo,
SmallVectorImpl<uint64_t> *Offsets,
uint64_t StartingOffset) {
// Given a struct type, recursively traverse the elements.
@@ -150,7 +150,7 @@ void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr;
for (unsigned I = 0, E = STy->getNumElements(); I != E; ++I) {
uint64_t EltOffset = SL ? SL->getElementOffset(I) : 0;
- computeValueLLTs(DL, *STy->getElementType(I), ValueTys, Offsets,
+ computeValueLLTs(DL, *STy->getElementType(I), ValueTys, EnableFPInfo, Offsets,
StartingOffset + EltOffset);
}
return;
@@ -160,7 +160,7 @@ void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
Type *EltTy = ATy->getElementType();
uint64_t EltSize = DL.getTypeAllocSize(EltTy).getFixedValue();
for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
- computeValueLLTs(DL, *EltTy, ValueTys, Offsets,
+ computeValueLLTs(DL, *EltTy, ValueTys, EnableFPInfo, Offsets,
StartingOffset + i * EltSize);
return;
}
@@ -168,7 +168,7 @@ void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
if (Ty.isVoidTy())
return;
// Base case: we can get an LLT for this LLVM IR type.
- ValueTys.push_back(getLLTForType(Ty, DL));
+ ValueTys.push_back(getLLTForType(Ty, DL, EnableFPInfo));
if (Offsets != nullptr)
Offsets->push_back(StartingOffset * 8);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index d17b20d977ce99..32702ee465fb49 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -158,7 +158,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
if (const Function *F = dyn_cast<Function>(CalleeV)) {
if (F->hasFnAttribute(Attribute::NonLazyBind)) {
- LLT Ty = getLLTForType(*F->getType(), DL);
+ LLT Ty = getLLTForType(*F->getType(), DL, /* EnableFPInfo */ true);
Register Reg = MIRBuilder.buildGlobalValue(Ty, F).getReg(0);
Info.Callee = MachineOperand::CreateReg(Reg, false);
} else {
@@ -780,11 +780,11 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
const MVT ValVT = VA.getValVT();
const MVT LocVT = VA.getLocVT();
- const LLT LocTy(LocVT);
- const LLT ValTy(ValVT);
+ const LLT LocTy(LocVT, /* EnableFPInfo */ true);
+ const LLT ValTy(ValVT, /* EnableFPInfo */ true);
const LLT NewLLT = Handler.isIncomingArgumentHandler() ? LocTy : ValTy;
const EVT OrigVT = EVT::getEVT(Args[i].Ty);
- const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
+ const LLT OrigTy = getLLTForType(*Args[i].Ty, DL, /* EnableFPInfo */ true);
const LLT PointerTy = LLT::pointer(
AllocaAddressSpace, DL.getPointerSizeInBits(AllocaAddressSpace));
@@ -822,8 +822,11 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
if (!Handler.isIncomingArgumentHandler() && OrigTy != ValTy &&
VA.getLocInfo() != CCValAssign::Indirect) {
assert(Args[i].OrigRegs.size() == 1);
+ unsigned ExtendOp = extendOpFromFlags(Args[i].Flags[0]);
+ if (OrigTy.isFloat() && ValTy.isFloat())
+ ExtendOp = TargetOpcode::G_FPEXT;
buildCopyToRegs(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy,
- ValTy, extendOpFromFlags(Args[i].Flags[0]));
+ ValTy, ExtendOp);
}
bool IndirectParameterPassingHandled = false;
@@ -1003,7 +1006,7 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
Align BaseAlign = DL.getPrefTypeAlign(RetTy);
Type *RetPtrTy =
PointerType::get(RetTy->getContext(), DL.getAllocaAddrSpace());
- LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetPtrTy), DL);
+ LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetPtrTy), DL, /* EnableFPInfo */ true);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
@@ -1033,7 +1036,7 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
unsigned NumValues = SplitVTs.size();
Align BaseAlign = DL.getPrefTypeAlign(RetTy);
unsigned AS = DL.getAllocaAddrSpace();
- LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetTy->getContext(), AS), DL);
+ LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetTy->getContext(), AS), DL, /* EnableFPInfo */ true);
MachinePointerInfo PtrInfo(AS);
@@ -1291,8 +1294,8 @@ void CallLowering::ValueHandler::copyArgumentMemory(
Register CallLowering::ValueHandler::extendRegister(Register ValReg,
const CCValAssign &VA,
unsigned MaxSizeBits) {
- LLT LocTy{VA.getLocVT()};
- LLT ValTy{VA.getValVT()};
+ LLT LocTy(VA.getLocVT(), /* EnableFPInfo */ true);
+ LLT ValTy(VA.getValVT(), /* EnableFPInfo */ true);
if (LocTy.getSizeInBits() == ValTy.getSizeInBits())
return ValReg;
@@ -1383,7 +1386,7 @@ static bool isCopyCompatibleType(LLT SrcTy, LLT DstTy) {
void CallLowering::IncomingValueHandler::assignValueToReg(
Register ValVReg, Register PhysReg, const CCValAssign &VA) {
const MVT LocVT = VA.getLocVT();
- const LLT LocTy(LocVT);
+ const LLT LocTy(LocVT, true);
const LLT RegTy = MRI.getType(ValVReg);
if (isCopyCompatibleType(RegTy, LocTy)) {
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index f668e41094bbc8..28041033e6df57 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -129,8 +129,8 @@ static void reportTranslationError(MachineFunction &MF,
ORE.emit(R);
}
-IRTranslator::IRTranslator(CodeGenOptLevel optlevel)
- : MachineFunctionPass(ID), OptLevel(optlevel) {}
+IRTranslator::IRTranslator(CodeGenOptLevel optlevel, bool EnableFPInfo)
+ : MachineFunctionPass(ID), OptLevel(optlevel), EnableFPInfo(EnableFPInfo) {}
#ifndef NDEBUG
namespace {
@@ -194,7 +194,7 @@ IRTranslator::allocateVRegs(const Value &Val) {
auto *Regs = VMap.getVRegs(Val);
auto *Offsets = VMap.getOffsets(Val);
SmallVector<LLT, 4> SplitTys;
- computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ computeValueLLTs(*DL, *Val.getType(), SplitTys, EnableFPInfo,
Offsets->empty() ? Offsets : nullptr);
for (unsigned i = 0; i < SplitTys.size(); ++i)
Regs->push_back(0);
@@ -218,7 +218,7 @@ ArrayRef<Register> IRTranslator::getOrCreateVRegs(const Value &Val) {
"Don't know how to create an empty vreg");
SmallVector<LLT, 4> SplitTys;
- computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ computeValueLLTs(*DL, *Val.getType(), SplitTys, EnableFPInfo,
Offsets->empty() ? Offsets : nullptr);
if (!isa<Constant>(Val)) {
@@ -840,7 +840,7 @@ void IRTranslator::emitJumpTable(SwitchCG::JumpTable &JT,
MIB.setDebugLoc(CurBuilder->getDebugLoc());
Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
- const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
auto Table = MIB.buildJumpTable(PtrTy, JT.JTI);
MIB.buildBrJT(Table.getReg(0), JT.JTI, JT.Reg);
@@ -855,7 +855,7 @@ bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
const Value &SValue = *JTH.SValue;
// Subtract the lowest switch case value from the value being switched on.
- const LLT SwitchTy = getLLTForType(*SValue.getType(), *DL);
+ const LLT SwitchTy = getLLTForType(*SValue.getType(), *DL, EnableFPInfo);
Register SwitchOpReg = getOrCreateVReg(SValue);
auto FirstCst = MIB.buildConstant(SwitchTy, JTH.First);
auto Sub = MIB.buildSub({SwitchTy}, SwitchOpReg, FirstCst);
@@ -863,7 +863,7 @@ bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
// This value may be smaller or larger than the target's pointer type, and
// therefore require extension or truncating.
auto *PtrIRTy = PointerType::getUnqual(SValue.getContext());
- const LLT PtrScalarTy = LLT::scalar(DL->getTypeSizeInBits(PtrIRTy));
+ const LLT PtrScalarTy = LLT::integer(DL->getTypeSizeInBits(PtrIRTy));
Sub = MIB.buildZExtOrTrunc(PtrScalarTy, Sub);
JT.Reg = Sub.getReg(0);
@@ -880,7 +880,8 @@ bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
auto Cst = getOrCreateVReg(
*ConstantInt::get(SValue.getType(), JTH.Last - JTH.First));
Cst = MIB.buildZExtOrTrunc(PtrScalarTy, Cst).getReg(0);
- auto Cmp = MIB.buildICmp(CmpInst::ICMP_UGT, LLT::scalar(1), Sub, Cst);
+ LLT CmpTy = LLT::integer(1);
+ auto Cmp = MIB.buildICmp(CmpInst::ICMP_UGT, CmpTy, Sub, Cst);
auto BrCond = MIB.buildBrCond(Cmp.getReg(0), *JT.Default);
@@ -911,7 +912,7 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
return;
}
- const LLT i1Ty = LLT::scalar(1);
+ const LLT i1Ty = LLT::integer(1);
// Build the compare.
if (!CB.CmpMHS) {
const auto *CI = dyn_cast<ConstantInt>(CB.CmpRHS);
@@ -1088,19 +1089,19 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
auto RangeSub = MIB.buildSub(SwitchOpTy, SwitchOpReg, MinValReg);
Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
- const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
LLT MaskTy = SwitchOpTy;
if (MaskTy.getSizeInBits() > PtrTy.getSizeInBits() ||
!llvm::has_single_bit<uint32_t>(MaskTy.getSizeInBits()))
- MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ MaskTy = LLT::integer(PtrTy.getSizeInBits());
else {
// Ensure that the type will fit the mask value.
for (unsigned I = 0, E = B.Cases.size(); I != E; ++I) {
if (!isUIntN(SwitchOpTy.getSizeInBits(), B.Cases[I].Mask)) {
// Switch table case range are encoded into series of masks.
// Just use pointer type, it's guaranteed to fit.
- MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ MaskTy = LLT::integer(PtrTy.getSizeInBits());
break;
}
}
@@ -1109,7 +1110,7 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
if (SwitchOpTy != MaskTy)
SubReg = MIB.buildZExtOrTrunc(MaskTy, SubReg).getReg(0);
- B.RegVT = getMVTForLLT(MaskTy);
+ B.RegVT = getMVTForLLT(MaskTy, EnableFPInfo);
B.Reg = SubReg;
MachineBasicBlock *MBB = B.Cases[0].ThisBB;
@@ -1123,7 +1124,8 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
if (!B.FallthroughUnreachable) {
// Conditional branch to the default block.
auto RangeCst = MIB.buildConstant(SwitchOpTy, B.Range);
- auto RangeCmp = MIB.buildICmp(CmpInst::Predicate::ICMP_UGT, LLT::scalar(1),
+ LLT CmpTy = LLT::integer(1);
+ auto RangeCmp = MIB.buildICmp(CmpInst::Predicate::ICMP_UGT, CmpTy,
RangeSub, RangeCst);
MIB.buildBrCond(RangeCmp, *B.Default);
}
@@ -1141,7 +1143,8 @@ void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB,
MachineIRBuilder &MIB = *CurBuilder;
MIB.setMBB(*SwitchBB);
- LLT SwitchTy = getLLTForMVT(BB.RegVT);
+ LLT SwitchTy = getLLTForMVT(BB.RegVT, EnableFPInfo);
+ LLT I1 = LLT::integer(1);
Register Cmp;
unsigned PopCount = llvm::popcount(B.Mask);
if (PopCount == 1) {
@@ -1150,13 +1153,13 @@ void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB,
auto MaskTrailingZeros =
MIB.buildConstant(SwitchTy, llvm::countr_zero(B.Mask));
Cmp =
- MIB.buildICmp(ICmpInst::ICMP_EQ, LLT::scalar(1), Reg, MaskTrailingZeros)
+ MIB.buildICmp(ICmpInst::ICMP_EQ, I1, Reg, MaskTrailingZeros)
.getReg(0);
} else if (PopCount == BB.Range) {
// There is only one zero bit in the range, test for it directly.
auto MaskTrailingOnes =
MIB.buildConstant(SwitchTy, llvm::countr_one(B.Mask));
- Cmp = MIB.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Reg, MaskTrailingOnes)
+ Cmp = MIB.buildICmp(CmpInst::ICMP_NE, I1, Reg, MaskTrailingOnes)
.getReg(0);
} else {
// Make desired shift.
@@ -1167,7 +1170,7 @@ void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB,
auto CstMask = MIB.buildConstant(SwitchTy, B.Mask);
auto AndOp = MIB.buildAnd(SwitchTy, SwitchVal, CstMask);
auto CstZero = MIB.buildConstant(SwitchTy, 0);
- Cmp = MIB.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), AndOp, CstZero)
+ Cmp = MIB.buildICmp(CmpInst::ICMP_NE, I1, AndOp, CstZero)
.getReg(0);
}
@@ -1368,7 +1371,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
const Value *Ptr = LI.getPointerOperand();
Type *OffsetIRTy = DL->getIndexType(Ptr->getType());
- LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
if (CLI->supportSwiftError() && isSwiftError(Ptr)) {
assert(Regs.size() == 1 && "swifterror should be single pointer");
@@ -1415,7 +1418,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
Register Base = getOrCreateVReg(*SI.getPointerOperand());
Type *OffsetIRTy = DL->getIndexType(SI.getPointerOperandType());
- LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
if (CLI->supportSwiftError() && isSwiftError(SI.getPointerOperand())) {
assert(Vals.size() == 1 && "swifterror should be single pointer");
@@ -1538,8 +1541,8 @@ bool IRTranslator::translateCopy(const User &U, const Value &V,
bool IRTranslator::translateBitCast(const User &U,
MachineIRBuilder &MIRBuilder) {
// If we're bitcasting to the source type, we can reuse the source vreg.
- if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
- getLLTForType(*U.getType(), *DL)) {
+ if (getLLTForType(*U.getOperand(0)->getType(), *DL, EnableFPInfo) ==
+ getLLTForType(*U.getType(), *DL, EnableFPInfo)) {
// If the source is a ConstantInt then it was probably created by
// ConstantHoisting and we should leave it alone.
if (isa<ConstantInt>(U.getOperand(0)))
@@ -1572,9 +1575,9 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Value &Op0 = *U.getOperand(0);
Register BaseReg = getOrCreateVReg(Op0);
Type *PtrIRTy = Op0.getType();
- LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
Type *OffsetIRTy = DL->getIndexType(PtrIRTy);
- LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
uint32_t Flags = 0;
if (const Instruction *I = dyn_cast<Instruction>(&U))
@@ -1601,9 +1604,9 @@ bool IRTranslator::translateGetElementPtr(const User &U,
BaseReg)
.getReg(0);
PtrIRTy = FixedVectorType::get(PtrIRTy, VectorWidth);
- PtrTy = getLLTForType(*PtrIRTy, *DL);
+ PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
OffsetIRTy = DL->getIndexType(PtrIRTy);
- OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ OffsetTy = getLLTForType(*OffsetIRTy, *DL, EnableFPInfo);
}
int64_t Offset = 0;
@@ -1651,7 +1654,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Register GepOffsetReg;
if (ElementSize != 1) {
auto ElementSizeMIB = MIRBuilder.buildConstant(
- getLLTForType(*OffsetIRTy, *DL), ElementSize);
+ getLLTForType(*OffsetIRTy, *DL, EnableFPInfo), ElementSize);
GepOffsetReg =
MIRBuilder.buildMul(OffsetTy, IdxReg, ElementSizeMIB).getReg(0);
} else
@@ -1696,7 +1699,7 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
SrcRegs.push_back(SrcReg);
}
- LLT SizeTy = LLT::scalar(MinPtrSize);
+ LLT SizeTy = LLT::integer(MinPtrSize);
// The size operand should be the minimum of the pointer sizes.
Register &SizeOpReg = SrcRegs[SrcRegs.size() - 1];
@@ -2313,7 +2316,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
MIRBuilder.buildFMA(Dst, Op0, Op1, Op2,
MachineInstr::copyFlagsFromInstruction(CI));
} else {
- LLT Ty = getLLTForType(*CI.getType(), *DL);
+ LLT Ty = getLLTForType(*CI.getType(), *DL, EnableFPInfo);
auto FMul = MIRBuilder.buildFMul(
Ty, Op0, Op1, MachineInstr::copyFlagsFromInstruction(CI));
MIRBuilder.buildFAdd(Dst, FMul, Op2,
@@ -2380,7 +2383,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getStackGuard(getOrCreateVReg(CI), MIRBuilder);
return true;
case Intrinsic::stackprotector: {
- LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL, EnableFPInfo);
Register GuardVal;
if (TLI->useLoadStackGuardNode(*CI.getModule())) {
GuardVal = MRI->createGenericVirtualRegister(PtrTy);
@@ -2423,7 +2426,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return true;
}
case Intrinsic::invariant_start: {
- LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL, EnableFPInfo);
Register Undef = MRI->createGenericVirtualRegister(PtrTy);
MIRBuilder.buildUndef(Undef);
return true;
@@ -2622,7 +2625,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
case Intrinsic::vector_deinterleave2: {
// Both intrinsics have at least one operand.
Value *Op0 = CI.getOperand(0);
- LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout());
+ LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout(), EnableFPInfo);
if (!ResTy.isFixedVector())
return false;
@@ -2670,7 +2673,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
for (const auto &Arg : CB.args()) {
if (CLI->supportSwiftError() && isSwiftError(Arg)) {
assert(SwiftInVReg == 0 && "Expected only one swift error argument");
- LLT Ty = getLLTForType(*Arg->getType(), *DL);
+ LLT Ty = getLLTForType(*Arg->getType(), *DL, EnableFPInfo);
SwiftInVReg = MRI->createGenericVirtualRegister(Ty);
MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt(
&CB, &MIRBuilder.getMBB(), Arg));
@@ -2823,8 +2826,8 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
Align Alignment = Info.align.value_or(
DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
LLT MemTy = Info.memVT.isSimple()
- ? getLLTForMVT(Info.memVT.getSimpleVT())
- : LLT::scalar(Info.memVT.getStoreSizeInBits());
+ ? getLLTForMVT(Info.memVT.getSimpleVT(), EnableFPInfo)
+ : LLT::integer(Info.memVT.getStoreSizeInBits());
// TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
// didn't yield anything useful.
@@ -3030,13 +3033,13 @@ bool IRTranslator::translateLandingPad(const User &U,
if (auto *RegMask = TRI.getCustomEHPadPreservedMask(*MF))
MF->getRegInfo().addPhysRegsUsedFromRegMask(RegMask);
- LLT Ty = getLLTForType(*LP.getType(), *DL);
+ LLT Ty = getLLTForType(*LP.getType(), *DL, EnableFPInfo);
Register Undef = MRI->createGenericVirtualRegister(Ty);
MIRBuilder.buildUndef(Undef);
SmallVector<LLT, 2> Tys;
for (Type *Ty : cast<StructType>(LP.getType())->elements())
- Tys.push_back(getLLTForType(*Ty, *DL));
+ Tys.push_back(getLLTForType(*Ty, *DL, EnableFPInfo));
assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
// Mark exception register as live in.
@@ -3170,7 +3173,7 @@ bool IRTranslator::translateInsertElement(const User &U,
if (!Idx)
Idx = getOrCreateVReg(*U.getOperand(2));
if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
- const LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ const LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Idx = MIRBuilder.buildZExtOrTrunc(VecIdxTy, Idx).getReg(0);
}
MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx);
@@ -3213,7 +3216,7 @@ bool IRTranslator::translateInsertVector(const User &U,
if (isa<ScalableVectorType>(U.getOperand(0)->getType())) {
// We are inserting an illegal fixed vector into a scalable
// vector, use a scalar element insert.
- LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Register Idx = getOrCreateVReg(*CI);
auto ScaledIndex = MIRBuilder.buildMul(
VecIdxTy, MIRBuilder.buildVScale(VecIdxTy, 1), Idx);
@@ -3251,7 +3254,7 @@ bool IRTranslator::translateExtractElement(const User &U,
if (!Idx)
Idx = getOrCreateVReg(*U.getOperand(1));
if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
- const LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ const LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Idx = MIRBuilder.buildZExtOrTrunc(VecIdxTy, Idx).getReg(0);
}
MIRBuilder.buildExtractVectorElement(Res, Val, Idx);
@@ -3291,7 +3294,7 @@ bool IRTranslator::translateExtractVector(const User &U,
if (isa<ScalableVectorType>(U.getOperand(0)->getType())) {
// We are extracting an illegal fixed vector from a scalable
// vector, use a scalar element extract.
- LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ LLT VecIdxTy = LLT::integer(PreferredVecIdxWidth);
Register Idx = getOrCreateVReg(*CI);
auto ScaledIndex = MIRBuilder.buildMul(
VecIdxTy, MIRBuilder.buildVScale(VecIdxTy, 1), Idx);
@@ -3819,8 +3822,8 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
CurBuilder->setInsertPt(*ParentBB, ParentBB->end());
// First create the loads to the guard/stack slot for the comparison.
Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
- const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
- LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL));
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL, EnableFPInfo);
+ LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL), EnableFPInfo);
MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
int FI = MFI.getStackProtectorIndex();
@@ -3880,8 +3883,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
// If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
// Otherwise, emit a volatile load to retrieve the stack guard value.
if (TLI->useLoadStackGuardNode(*ParentBB->getBasicBlock()->getModule())) {
+ LLT RegTy = LLT::integer(PtrTy.getSizeInBits());
Guard =
- MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits()));
+ MRI->createGenericVirtualRegister(RegTy);
getStackGuard(Guard, *CurBuilder);
} else {
// TODO: test using android subtarget when we support @llvm.thread.pointer.
@@ -3897,8 +3901,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
}
// Perform the comparison.
+ LLT I1 = LLT::integer(1);
auto Cmp =
- CurBuilder->buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Guard, GuardVal);
+ CurBuilder->buildICmp(CmpInst::ICMP_NE, I1, Guard, GuardVal);
// If the guard/stackslot do not equal, branch to failure MBB.
CurBuilder->buildBrCond(Cmp, *SPD.getFailureMBB());
// Otherwise branch to success MBB.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 666615202a4b5f..f3e0c24796599b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -32,10 +32,19 @@ namespace {
/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
Register ValVReg, const CCValAssign &VA) {
- if (VA.getLocVT().getSizeInBits() < 32) {
+ LLT SrcTy = LLT(VA.getLocVT(), /*EnableFPInfo*/ true);
+
+ if (SrcTy.getSizeInBits() < 32) {
+ LLT I32 = LLT::integer(32);
+ LLT DstTy = LLT::integer(SrcTy.getSizeInBits());
+
+ Register SrcReg = ValVReg;
+ if (SrcTy.isFloat())
+ SrcReg = Handler.MIRBuilder.buildBitcast(DstTy, ValVReg).getReg(0);
+
// 16-bit types are reported as legal for 32-bit registers. We need to
// extend and do a 32-bit copy to avoid the verifier complaining about it.
- return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
+ return Handler.MIRBuilder.buildAnyExt(I32, SrcReg).getReg(0);
}
return Handler.extendRegister(ValVReg, VA);
@@ -119,16 +128,28 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
const CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
+ LLT LocTy = LLT(VA.getLocVT(), /* EnableFPInfo */ true);
- if (VA.getLocVT().getSizeInBits() < 32) {
+ if (LocTy.getSizeInBits() < 32) {
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
- auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+ Register CopyReg = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg).getReg(0);
+
+ if (LocTy.getScalarType().isFloat()) {
+ LLT TruncTy = LocTy.isVector()
+ ? LLT::vector(LocTy.getElementCount(),
+ LLT::integer(LocTy.getScalarSizeInBits()))
+ : LLT::integer(LocTy.getScalarSizeInBits());
+
+ auto Extended = buildExtensionHint(VA, CopyReg, TruncTy);
+ auto Trunc = MIRBuilder.buildTrunc(TruncTy, Extended);
+ MIRBuilder.buildBitcast(ValVReg, Trunc.getReg(0));
+ return;
+ }
// If we have signext/zeroext, it applies to the whole 32-bit register
// before truncation.
- auto Extended =
- buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
+ auto Extended = buildExtensionHint(VA, CopyReg, LocTy);
MIRBuilder.buildTrunc(ValVReg, Extended);
return;
}
@@ -332,7 +353,7 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
extOpcodeToISDExtOpcode(ExtendOp));
if (ExtVT != VT) {
RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
- LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
+ LLT ExtTy = getLLTForType(*RetInfo.Ty, DL, /* EnableFPInfo */ true);
Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
}
}
@@ -422,7 +443,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);
- LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
+ LLT ArgTy = getLLTForType(*SplitArg.Ty, DL, /* EnableFPInfo */ true);
if (SplitArg.Flags[0].isPointer()) {
// Compensate for losing pointeriness in splitValueTypes.
LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
>From bbb0eb14493c4329af11cbfb952019c138f5dace Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Wed, 11 Dec 2024 12:39:18 +0000
Subject: [PATCH 06/11] re-enable bfloat
---
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 28041033e6df57..f196dd8dc8f10c 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1556,10 +1556,6 @@ bool IRTranslator::translateBitCast(const User &U,
bool IRTranslator::translateCast(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
- if (U.getType()->getScalarType()->isBFloatTy() ||
- U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
- return false;
-
uint32_t Flags = 0;
if (const Instruction *I = dyn_cast<Instruction>(&U))
Flags = MachineInstr::copyFlagsFromInstruction(*I);
>From 3fd93b9c9f97343ce043ccd2238c513e5fb278b3 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Wed, 11 Dec 2024 12:40:56 +0000
Subject: [PATCH 07/11] temp patch float -> integer
---
llvm/include/llvm/CodeGenTypes/LowLevelType.h | 12 ++++++------
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 ++++++--------
2 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index cf5f740c364d39..d42c4a5ed01fdb 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -139,25 +139,25 @@ class LLT {
}
// Get a 16-bit brain float value.
- static constexpr LLT bfloat() { return integer(16); }
+ static constexpr LLT bfloat() { return floatingPoint(16, FPInfo::VARIANT_FLOAT_1); }
/// Get a 16-bit IEEE half value.
- static constexpr LLT float16() { return integer(16); }
+ static constexpr LLT float16() { return floatingPoint(16, FPInfo::IEEE_FLOAT); }
/// Get a 32-bit IEEE float value.
- static constexpr LLT float32() { return integer(32); }
+ static constexpr LLT float32() { return floatingPoint(32, FPInfo::IEEE_FLOAT); }
/// Get a 64-bit IEEE double value.
- static constexpr LLT float64() { return integer(64); }
+ static constexpr LLT float64() { return floatingPoint(64, FPInfo::IEEE_FLOAT); }
/// Get a 80-bit X86 floating point value.
- static constexpr LLT x86fp80() { return integer(80); }
+ static constexpr LLT x86fp80() { return floatingPoint(80, FPInfo::VARIANT_FLOAT_1); }
/// Get a 128-bit IEEE quad value.
static constexpr LLT float128() { return floatingPoint(128, FPInfo::IEEE_FLOAT); }
/// Get a 128-bit PowerPC double double value.
- static constexpr LLT ppcf128() { return integer(128); }
+ static constexpr LLT ppcf128() { return floatingPoint(128, FPInfo::VARIANT_FLOAT_1); }
/// Get a low-level fixed-width vector of some number of elements and element
/// width.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 439cc78ed705e8..c358641cdef170 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -282,10 +282,11 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
+static const LLT F16 = LLT::integer(16);
static const LLT S32 = LLT::scalar(32);
-static const LLT F32 = LLT::float32();
+static const LLT F32 = LLT::integer(32);
static const LLT S64 = LLT::scalar(64);
-static const LLT F64 = LLT::float64();
+static const LLT F64 = LLT::integer(64);
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
@@ -305,7 +306,7 @@ static const LLT V10S16 = LLT::fixed_vector(10, 16);
static const LLT V12S16 = LLT::fixed_vector(12, 16);
static const LLT V16S16 = LLT::fixed_vector(16, 16);
-static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
+static const LLT V2F16 = LLT::fixed_vector(2, LLT::integer(16));
static const LLT V2BF16 = V2F16; // FIXME
static const LLT V2S32 = LLT::fixed_vector(2, 32);
@@ -3198,10 +3199,10 @@ bool AMDGPULegalizerInfo::legalizeFMad(
// TODO: Always legal with future ftz flag.
// FIXME: Do we need just output?
- if (Ty == LLT::float32() &&
+ if (Ty == F32 &&
MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
return true;
- if (Ty == LLT::float16() &&
+ if (Ty == F16 &&
MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
return true;
@@ -3753,8 +3754,6 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
Register Src1 = MI.getOperand(2).getReg();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
- const LLT F16 = LLT::float16();
- const LLT F32 = LLT::float32();
if (Ty == F32) {
auto Log = B.buildFLog2(F32, Src0, Flags);
@@ -3797,7 +3796,6 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
MachineIRBuilder &B) const {
const LLT S1 = LLT::scalar(1);
- const LLT F64 = LLT::float64();
Register Dst = MI.getOperand(0).getReg();
Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
>From c90d8ea1046df85a243a94f992a561a0e1e3975d Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Mon, 16 Dec 2024 12:45:09 +0000
Subject: [PATCH 08/11] AMDGPU legalizer WIP
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 19 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1906 +++++++++--------
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 +
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 11 +-
4 files changed, 1031 insertions(+), 908 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a4239f2567146d..63cb2e6ef92b87 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2012,6 +2012,15 @@ Register LegalizerHelper::coerceToScalar(Register Val) {
void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
unsigned OpIdx, unsigned ExtOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
+ LLT SrcTy = MRI.getType(MO.getReg());
+
+ if (SrcTy.isFloat() && ExtOpcode != TargetOpcode::G_FPEXT) {
+ auto Cast = MIRBuilder.buildBitcast(SrcTy.dropType(), MO);
+ auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {Cast});
+ MO.setReg(ExtB.getReg(0));
+ return;
+ }
+
auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
MO.setReg(ExtB.getReg(0));
}
@@ -2026,8 +2035,18 @@ void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
unsigned OpIdx, unsigned TruncOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
+ LLT DstTy = MRI.getType(MO.getReg());
Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+
+ if (DstTy.isFloat() && TruncOpcode != TargetOpcode::G_FPTRUNC) {
+ auto Trunc = MIRBuilder.buildInstr(TruncOpcode, {DstTy.dropType()}, {DstExt});
+ MIRBuilder.buildBitcast(MO, Trunc);
+ MO.setReg(DstExt);
+ return;
+ }
+
MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
MO.setReg(DstExt);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c358641cdef170..ea4d9059b35f5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,9 +26,12 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -52,6 +55,147 @@ static cl::opt<bool> EnableNewLegality(
static constexpr unsigned MaxRegisterSize = 1024;
+
+static const LLT I1 = LLT::integer(1);
+static const LLT I8 = LLT::integer(8);
+static const LLT I16 = LLT::integer(16);
+static const LLT F16 = LLT::float16();
+static const LLT BF16 = LLT::bfloat();
+static const LLT I32 = LLT::integer(32);
+static const LLT F32 = LLT::float32();
+static const LLT I64 = LLT::integer(64);
+static const LLT F64 = LLT::float64();
+static const LLT I96 = LLT::integer(96);
+static const LLT I128 = LLT::integer(128);
+static const LLT I160 = LLT::integer(160);
+static const LLT I192 = LLT::integer(192);
+static const LLT I224 = LLT::integer(224);
+static const LLT I256 = LLT::integer(256);
+static const LLT I512 = LLT::integer(512);
+static const LLT I1024 = LLT::integer(1024);
+static const LLT MaxScalar = LLT::integer(MaxRegisterSize);
+
+static const LLT V2I8 = LLT::fixed_vector(2, I8);
+
+static const LLT V2I16 = LLT::fixed_vector(2, I16);
+static const LLT V3I16 = LLT::fixed_vector(3, I16);
+static const LLT V4I16 = LLT::fixed_vector(4, I16);
+static const LLT V6I16 = LLT::fixed_vector(6, I16);
+static const LLT V8I16 = LLT::fixed_vector(8, I16);
+static const LLT V10I16 = LLT::fixed_vector(10, I16);
+static const LLT V12I16 = LLT::fixed_vector(12, I16);
+static const LLT V16I16 = LLT::fixed_vector(16, I16);
+
+static const LLT V2F16 = LLT::fixed_vector(2, F16);
+static const LLT V4F16 = LLT::fixed_vector(4, F16);
+static const LLT V6F16 = LLT::fixed_vector(6, F16);
+static const LLT V8F16 = LLT::fixed_vector(8, F16);
+static const LLT V10F16 = LLT::fixed_vector(10, F16);
+static const LLT V12F16 = LLT::fixed_vector(12, F16);
+static const LLT V16F16 = LLT::fixed_vector(16, F16);
+
+static const LLT V2BF16 = LLT::fixed_vector(2, BF16);
+static const LLT V4BF16 = LLT::fixed_vector(4, BF16);
+static const LLT V6BF16 = LLT::fixed_vector(6, BF16);
+static const LLT V8BF16 = LLT::fixed_vector(8, BF16);
+static const LLT V10BF16 = LLT::fixed_vector(10, BF16);
+static const LLT V12BF16 = LLT::fixed_vector(12, BF16);
+static const LLT V16BF16 = LLT::fixed_vector(16, BF16);
+
+static const LLT V2I32 = LLT::fixed_vector(2, I32);
+static const LLT V3I32 = LLT::fixed_vector(3, I32);
+static const LLT V4I32 = LLT::fixed_vector(4, I32);
+static const LLT V5I32 = LLT::fixed_vector(5, I32);
+static const LLT V6I32 = LLT::fixed_vector(6, I32);
+static const LLT V7I32 = LLT::fixed_vector(7, I32);
+static const LLT V8I32 = LLT::fixed_vector(8, I32);
+static const LLT V9I32 = LLT::fixed_vector(9, I32);
+static const LLT V10I32 = LLT::fixed_vector(10, I32);
+static const LLT V11I32 = LLT::fixed_vector(11, I32);
+static const LLT V12I32 = LLT::fixed_vector(12, I32);
+static const LLT V16I32 = LLT::fixed_vector(16, I32);
+static const LLT V32I32 = LLT::fixed_vector(32, I32);
+
+static const LLT V2F32 = LLT::fixed_vector(2, F32);
+static const LLT V3F32 = LLT::fixed_vector(3, F32);
+static const LLT V4F32 = LLT::fixed_vector(4, F32);
+static const LLT V5F32 = LLT::fixed_vector(5, F32);
+static const LLT V6F32 = LLT::fixed_vector(6, F32);
+static const LLT V7F32 = LLT::fixed_vector(7, F32);
+static const LLT V8F32 = LLT::fixed_vector(8, F32);
+static const LLT V9F32 = LLT::fixed_vector(9, F32);
+static const LLT V10F32 = LLT::fixed_vector(10, F32);
+static const LLT V11F32 = LLT::fixed_vector(11, F32);
+static const LLT V12F32 = LLT::fixed_vector(12, F32);
+static const LLT V16F32 = LLT::fixed_vector(16, F32);
+static const LLT V32F32 = LLT::fixed_vector(32, F32);
+
+static const LLT V2I64 = LLT::fixed_vector(2, I64);
+static const LLT V3I64 = LLT::fixed_vector(3, I64);
+static const LLT V4I64 = LLT::fixed_vector(4, I64);
+static const LLT V5I64 = LLT::fixed_vector(5, I64);
+static const LLT V6I64 = LLT::fixed_vector(6, I64);
+static const LLT V7I64 = LLT::fixed_vector(7, I64);
+static const LLT V8I64 = LLT::fixed_vector(8, I64);
+static const LLT V16I64 = LLT::fixed_vector(16, I64);
+
+static const LLT V2F64 = LLT::fixed_vector(2, F64);
+static const LLT V3F64 = LLT::fixed_vector(3, F64);
+static const LLT V4F64 = LLT::fixed_vector(4, F64);
+static const LLT V5F64 = LLT::fixed_vector(5, F64);
+static const LLT V6F64 = LLT::fixed_vector(6, F64);
+static const LLT V7F64 = LLT::fixed_vector(7, F64);
+static const LLT V8F64 = LLT::fixed_vector(8, F64);
+static const LLT V16F64 = LLT::fixed_vector(16, F64);
+
+static const LLT V2I128 = LLT::fixed_vector(2, I128);
+static const LLT V4I128 = LLT::fixed_vector(4, I128);
+
+static std::initializer_list<LLT> AllScalarTypes = {
+ I16, F16, BF16, I32, F32, I64, F64, I96, I128, I160, I192, I224, I256, I512, I1024};
+
+static std::initializer_list<LLT> AllScalarTypes16Bit = {
+ I32, F32, I64, F64, I96, I128, I160, I192, I224, I256, I512, I1024};
+
+static std::initializer_list<LLT> AllS16Vectors{
+ V2I16, V2F16, V2BF16,
+ V4I16, V4F16, V4BF16,
+ V6I16, V6F16, V6BF16,
+ V8I16, V8F16, V8BF16,
+ V10I16, V10F16, V10BF16,
+ V12I16, V12F16, V12BF16,
+ V16I16, V16F16, V16BF16,
+ V2I128,
+ V4I128,
+};
+
+static std::initializer_list<LLT> AllS32Vectors = {
+ V2I32, V2F32,
+ V3I32, V3F32,
+ V4I32, V4F32,
+ V5I32, V5F32,
+ V6I32, V6F32,
+ V7I32, V7F32,
+ V8I32, V8F32,
+ V9I32, V9F32,
+ V10I32, V10F32,
+ V11I32, V11F32,
+ V12I32, V12F32,
+ V16I32, V16F32,
+ V32I32, V32F32,
+};
+
+static std::initializer_list<LLT> AllS64Vectors = {
+ V2I64, V2F64,
+ V3I64, V3F64,
+ V4I64, V4F64,
+ V5I64, V5F64,
+ V6I64, V6F64,
+ V7I64, V7F64,
+ V8I64, V8F64,
+ V16I64, V16F64,
+};
+
// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
unsigned NElts = Ty.getNumElements();
@@ -60,10 +204,10 @@ static LLT getPow2VectorType(LLT Ty) {
}
// Round the number of bits to the next power of two bits
-static LLT getPow2ScalarType(LLT Ty) {
+static LLT getPow2IntegerType(LLT Ty) {
unsigned Bits = Ty.getSizeInBits();
unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
- return LLT::scalar(Pow2Bits);
+ return LLT::integer(Pow2Bits);
}
/// \returns true if this is an odd sized vector which should widen by adding an
@@ -161,16 +305,16 @@ static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
static LLT getBufferRsrcScalarType(const LLT Ty) {
if (!Ty.isVector())
- return LLT::scalar(128);
+ return I128;
const ElementCount NumElems = Ty.getElementCount();
- return LLT::vector(NumElems, LLT::scalar(128));
+ return LLT::vector(NumElems, I128);
}
static LLT getBufferRsrcRegisterType(const LLT Ty) {
if (!Ty.isVector())
- return LLT::fixed_vector(4, LLT::scalar(32));
+ return V4I32;
const unsigned NumElems = Ty.getElementCount().getFixedValue();
- return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
+ return LLT::fixed_vector(NumElems * 4, I32);
}
static LLT getBitcastRegisterType(const LLT Ty) {
@@ -179,10 +323,10 @@ static LLT getBitcastRegisterType(const LLT Ty) {
if (Size <= 32) {
// <2 x s8> -> s16
// <4 x s8> -> s32
- return LLT::scalar(Size);
+ return LLT::integer(Size);
}
- return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
+ return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), I32);
}
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
@@ -198,7 +342,7 @@ static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
unsigned Size = Ty.getSizeInBits();
assert(Size % 32 == 0);
return std::pair(
- TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
+ TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), I32));
};
}
@@ -279,79 +423,10 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
};
}
-static const LLT S1 = LLT::scalar(1);
-static const LLT S8 = LLT::scalar(8);
-static const LLT S16 = LLT::scalar(16);
-static const LLT F16 = LLT::integer(16);
-static const LLT S32 = LLT::scalar(32);
-static const LLT F32 = LLT::integer(32);
-static const LLT S64 = LLT::scalar(64);
-static const LLT F64 = LLT::integer(64);
-static const LLT S96 = LLT::scalar(96);
-static const LLT S128 = LLT::scalar(128);
-static const LLT S160 = LLT::scalar(160);
-static const LLT S192 = LLT::scalar(192);
-static const LLT S224 = LLT::scalar(224);
-static const LLT S256 = LLT::scalar(256);
-static const LLT S512 = LLT::scalar(512);
-static const LLT S1024 = LLT::scalar(1024);
-static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
-
-static const LLT V2S8 = LLT::fixed_vector(2, 8);
-static const LLT V2S16 = LLT::fixed_vector(2, 16);
-static const LLT V4S16 = LLT::fixed_vector(4, 16);
-static const LLT V6S16 = LLT::fixed_vector(6, 16);
-static const LLT V8S16 = LLT::fixed_vector(8, 16);
-static const LLT V10S16 = LLT::fixed_vector(10, 16);
-static const LLT V12S16 = LLT::fixed_vector(12, 16);
-static const LLT V16S16 = LLT::fixed_vector(16, 16);
-
-static const LLT V2F16 = LLT::fixed_vector(2, LLT::integer(16));
-static const LLT V2BF16 = V2F16; // FIXME
-
-static const LLT V2S32 = LLT::fixed_vector(2, 32);
-static const LLT V3S32 = LLT::fixed_vector(3, 32);
-static const LLT V4S32 = LLT::fixed_vector(4, 32);
-static const LLT V5S32 = LLT::fixed_vector(5, 32);
-static const LLT V6S32 = LLT::fixed_vector(6, 32);
-static const LLT V7S32 = LLT::fixed_vector(7, 32);
-static const LLT V8S32 = LLT::fixed_vector(8, 32);
-static const LLT V9S32 = LLT::fixed_vector(9, 32);
-static const LLT V10S32 = LLT::fixed_vector(10, 32);
-static const LLT V11S32 = LLT::fixed_vector(11, 32);
-static const LLT V12S32 = LLT::fixed_vector(12, 32);
-static const LLT V16S32 = LLT::fixed_vector(16, 32);
-static const LLT V32S32 = LLT::fixed_vector(32, 32);
-
-static const LLT V2S64 = LLT::fixed_vector(2, 64);
-static const LLT V3S64 = LLT::fixed_vector(3, 64);
-static const LLT V4S64 = LLT::fixed_vector(4, 64);
-static const LLT V5S64 = LLT::fixed_vector(5, 64);
-static const LLT V6S64 = LLT::fixed_vector(6, 64);
-static const LLT V7S64 = LLT::fixed_vector(7, 64);
-static const LLT V8S64 = LLT::fixed_vector(8, 64);
-static const LLT V16S64 = LLT::fixed_vector(16, 64);
-
-static const LLT V2S128 = LLT::fixed_vector(2, 128);
-static const LLT V4S128 = LLT::fixed_vector(4, 128);
-
-static std::initializer_list<LLT> AllScalarTypes = {
- S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
-
-static std::initializer_list<LLT> AllS16Vectors{
- V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
-
-static std::initializer_list<LLT> AllS32Vectors = {
- V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
- V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
-
-static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
- V6S64, V7S64, V8S64, V16S64};
-
// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(LLT Ty) {
if (Ty.isPointerOrPointerVector())
- Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
+ Ty = Ty.changeElementType(LLT::integer(Ty.getScalarSizeInBits()));
return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
@@ -598,14 +673,13 @@ static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
if (!PointerTy.isVector()) {
// Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
- const LLT S32 = LLT::scalar(32);
Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
std::array<Register, 4> VectorElems;
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
for (unsigned I = 0; I < NumParts; ++I)
VectorElems[I] =
- B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
+ B.buildExtractVectorElementConstant(I32, VectorReg, I).getReg(0);
B.buildMergeValues(MO, VectorElems);
MO.setReg(VectorReg);
return VectorTy;
@@ -634,7 +708,7 @@ static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
// Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
SmallVector<Register, 4> PointerParts;
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
- auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
+ auto Unmerged = B.buildUnmerge(I32, Pointer);
for (unsigned I = 0; I < NumParts; ++I)
PointerParts.push_back(Unmerged.getReg(I));
return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
@@ -688,35 +762,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
const std::initializer_list<LLT> FPTypesBase = {
- S32, S64
+ F32, F64
};
const std::initializer_list<LLT> FPTypes16 = {
- S32, S64, S16
+ F32, F64, F16, BF16
};
const std::initializer_list<LLT> FPTypesPK16 = {
- S32, S64, S16, V2S16
+ F32, F64, F16, BF16, V2F16, V2BF16
};
- const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
+ const LLT MinScalarFPTy = ST.has16BitInsts() ? F16 : F32;
// s1 for VCC branches, s32 for SCC branches.
- getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({I1, I32});
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
getActionDefinitionsBuilder(G_PHI)
- .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
+ .legalFor({I32, F32, I64, F64, V2I16, V2F16, V2BF16, I16, F16, BF16, V4I16, V4F16, V4BF16, I1, I128, I256})
.legalFor(AllS32Vectors)
.legalFor(AllS64Vectors)
.legalFor(AddrSpaces64)
.legalFor(AddrSpaces32)
.legalFor(AddrSpaces128)
.legalIf(isPointer(0))
- .clampScalar(0, S16, S256)
+ .clampScalar(0, I16, I256)
.widenScalarToNextPow2(0, 32)
- .clampMaxNumElements(0, S32, 16)
+ .clampMaxNumElements(0, I32, 16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.scalarize(0);
@@ -724,60 +798,60 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Full set of gfx9 features.
if (ST.hasScalarAddSub64()) {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S64, S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I64, I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
+ .maxScalar(0, I32);
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
+ .maxScalar(0, I32);
}
if (ST.hasScalarSMulU64()) {
getActionDefinitionsBuilder(G_MUL)
- .legalFor({S64, S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I64, I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
.custom();
} else {
getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I32, I16, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
.custom();
}
assert(ST.hasMad64_32());
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
- .legalFor({S32, S16, V2S16}) // Clamp modifier
- .minScalarOrElt(0, S16)
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I32, I16, V2I16}) // Clamp modifier
+ .minScalarOrElt(0, I16)
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.lower();
} else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32, S16})
- .minScalar(0, S16)
+ .legalFor({I32, I16})
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32)
+ .maxScalar(0, I32)
.scalarize(0);
getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32, S16})
+ .legalFor({I32, I16})
.scalarize(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.widenScalarToNextMultipleOf(0, 32)
.custom();
assert(ST.hasMad64_32());
@@ -785,8 +859,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Technically the saturating operations require clamp bit support, but this
// was introduced at the same time as 16-bit operations.
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .legalFor({S32, S16}) // Clamp modifier
- .minScalar(0, S16)
+ .legalFor({I32, I16}) // Clamp modifier
+ .minScalar(0, I16)
.scalarize(0)
.widenScalarToNextPow2(0, 16)
.lower();
@@ -794,37 +868,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// We're just lowering this, but it helps get a better result to try to
// coerce to the desired type first.
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
- .minScalar(0, S16)
+ .minScalar(0, I16)
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32})
+ .legalFor({I32})
.widenScalarToNextMultipleOf(0, 32)
- .clampScalar(0, S32, S32)
+ .clampScalar(0, I32, I32)
.scalarize(0);
auto &Mul = getActionDefinitionsBuilder(G_MUL)
- .legalFor({S32})
+ .legalFor({I32})
.scalarize(0)
- .minScalar(0, S32)
+ .minScalar(0, I32)
.widenScalarToNextMultipleOf(0, 32);
if (ST.hasMad64_32())
Mul.custom();
else
- Mul.maxScalar(0, S32);
+ Mul.maxScalar(0, I32);
if (ST.hasIntClamp()) {
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .legalFor({S32}) // Clamp modifier.
+ .legalFor({I32}) // Clamp modifier.
.scalarize(0)
- .minScalarOrElt(0, S32)
+ .minScalarOrElt(0, I32)
.lower();
} else {
// Clamp bit support was added in VI, along with 16-bit operations.
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .minScalar(0, S32)
+ .minScalar(0, I32)
.scalarize(0)
.lower();
}
@@ -832,26 +906,26 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: DAG expansion gets better results. The widening uses the smaller
// range values and goes for the min/max lowering directly.
getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
- .minScalar(0, S32)
+ .minScalar(0, I32)
.scalarize(0)
.lower();
}
getActionDefinitionsBuilder(
{G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
- .customFor({S32, S64})
- .clampScalar(0, S32, S64)
+ .customFor({I32, I64})
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(0, 32)
.scalarize(0);
auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
- .legalFor({S32})
- .maxScalar(0, S32);
+ .legalFor({I32})
+ .maxScalar(0, I32);
if (ST.hasVOP3PInsts()) {
Mulh
- .clampMaxNumElements(0, S8, 2)
- .lowerFor({V2S8});
+ .clampMaxNumElements(0, I8, 2)
+ .lowerFor({V2I8});
}
Mulh
@@ -861,8 +935,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Report legal for any types we can handle anywhere. For the cases only legal
// on the SALU, RegBankSelect will be able to re-legalize.
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
- .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
- .clampScalar(0, S32, S64)
+ .legalFor({I32, I1, I64, V2I32, I16, V2I16, V4I16})
+ .clampScalar(0, I32, I64)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
.widenScalarToNextPow2(0)
@@ -870,8 +944,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(
{G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
- .legalFor({{S32, S1}, {S32, S32}})
- .clampScalar(0, S32, S32)
+ .legalFor({{I32, I1}, {I32, I32}})
+ .clampScalar(0, I32, I32)
.scalarize(0);
getActionDefinitionsBuilder(G_BITCAST)
@@ -880,40 +954,42 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({S1, S32, S64, S16, GlobalPtr,
+ .legalFor({I1, I32, I64, I16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
.legalIf(isPointer(0))
- .clampScalar(0, S32, S64)
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64, S16})
- .clampScalar(0, S16, S64);
+ .legalFor({F32, F64, F16, BF16})
+ .clampScalar(0, F16, F64);
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
.legalIf(isRegisterClassType(0))
// s1 and s16 are special cases because they have legal operations on
// them, but don't really occupy registers in the normal way.
- .legalFor({S1, S16})
- .clampNumElements(0, V16S32, V32S32)
+ .legalFor({I1, I16, F16, BF16})
+ .clampNumElements(0, V16I32, V32I32)
+ .clampNumElements(0, V16F32, V32F32)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampScalarOrElt(0, S32, MaxScalar)
+ .clampScalarOrElt(0, I32, MaxScalar)
.widenScalarToNextPow2(0, 32)
- .clampMaxNumElements(0, S32, 16);
+ .clampMaxNumElements(0, I32, 16)
+ .clampMaxNumElements(0, F32, 16);
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
// If the amount is divergent, we have to do a wave reduction to get the
// maximum value, so this is expanded during RegBankSelect.
getActionDefinitionsBuilder(G_DYN_STACKALLOC)
- .legalFor({{PrivatePtr, S32}});
+ .legalFor({{PrivatePtr, I32}, {PrivatePtr, F32}});
getActionDefinitionsBuilder(G_STACKSAVE)
.customFor({PrivatePtr});
getActionDefinitionsBuilder(G_STACKRESTORE)
.legalFor({PrivatePtr});
- getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
+ getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({I64});
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.customIf(typeIsNot(0, PrivatePtr));
@@ -923,25 +999,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &FPOpActions = getActionDefinitionsBuilder(
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
- .legalFor({S32, S64});
+ .legalFor({F32, F64});
auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
- .customFor({S32, S64});
+ .customFor({F32, F64});
auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
- .customFor({S32, S64});
+ .customFor({F32, F64});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts())
- FPOpActions.legalFor({S16, V2S16});
+ FPOpActions.legalFor({F16, V2F16});
else
- FPOpActions.legalFor({S16});
+ FPOpActions.legalFor({F16});
- TrigActions.customFor({S16});
- FDIVActions.customFor({S16});
+ TrigActions.customFor({F16});
+ FDIVActions.customFor({F16});
}
if (ST.hasPackedFP32Ops()) {
- FPOpActions.legalFor({V2S32});
- FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
+ FPOpActions.legalFor({V2F32});
+ FPOpActions.clampMaxNumElementsStrict(0, F32, 2);
}
auto &MinNumMaxNum = getActionDefinitionsBuilder({
@@ -950,154 +1026,156 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampMaxNumElements(0, S16, 2)
- .clampScalar(0, S16, S64)
+ .clampMaxNumElements(0, F16, 2)
+ .clampScalar(0, F16, F64)
.scalarize(0);
} else if (ST.has16BitInsts()) {
MinNumMaxNum.customFor(FPTypes16)
- .clampScalar(0, S16, S64)
+ .clampScalar(0, F16, F64)
.scalarize(0);
} else {
MinNumMaxNum.customFor(FPTypesBase)
- .clampScalar(0, S32, S64)
+ .clampScalar(0, F32, F64)
.scalarize(0);
}
if (ST.hasVOP3PInsts())
- FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
-
+ FPOpActions.clampMaxNumElementsStrict(0, F16, 2);
+
FPOpActions
.scalarize(0)
- .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ .clampScalar(0, ST.has16BitInsts() ? F16 : F32, F64);
TrigActions
.scalarize(0)
- .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ .clampScalar(0, ST.has16BitInsts() ? F16 : F32, F64);
FDIVActions
.scalarize(0)
- .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ .clampScalar(0, ST.has16BitInsts() ? F16 : F32, F64);
getActionDefinitionsBuilder({G_FNEG, G_FABS})
.legalFor(FPTypesPK16)
- .clampMaxNumElementsStrict(0, S16, 2)
+ .clampMaxNumElementsStrict(0, F16, 2)
.scalarize(0)
- .clampScalar(0, S16, S64);
+ .clampScalar(0, F16, F64);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_FSQRT)
- .legalFor({S16})
- .customFor({S32, S64})
+ .legalFor({F16})
+ .customFor({F32, F64})
.scalarize(0)
.unsupported();
getActionDefinitionsBuilder(G_FFLOOR)
- .legalFor({S32, S64, S16})
+ .legalFor({F32, F64, F16})
.scalarize(0)
- .clampScalar(0, S16, S64);
+ .clampScalar(0, F16, F64);
getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
- .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
+ .legalFor({{F32, I32}, {F64, I32}, {F16, I16}})
.scalarize(0)
- .maxScalarIf(typeIs(0, S16), 1, S16)
- .clampScalar(1, S32, S32)
+ .maxScalarIf(typeIs(0, F16), 1, I16)
+ .clampScalar(1, I32, I32)
.lower();
getActionDefinitionsBuilder(G_FFREXP)
- .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
+ .customFor({{F32, F32}, {F64, F32}, {F16, F16}, {F16, F32}})
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder(G_FSQRT)
- .customFor({S32, S64, S16})
+ .customFor({F32, F64, F16})
.scalarize(0)
.unsupported();
if (ST.hasFractBug()) {
getActionDefinitionsBuilder(G_FFLOOR)
- .customFor({S64})
- .legalFor({S32, S64})
+ .customFor({F64})
+ .legalFor({F32, F64})
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, F32, F64);
} else {
getActionDefinitionsBuilder(G_FFLOOR)
- .legalFor({S32, S64})
+ .legalFor({F32, F64})
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, F32, F64);
}
getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
- .legalFor({{S32, S32}, {S64, S32}})
+ .legalFor({{F32, I32}, {F64, I32}})
.scalarize(0)
- .clampScalar(0, S32, S64)
- .clampScalar(1, S32, S32)
+ .clampScalar(0, F32, F64)
+ .clampScalar(1, I32, I32)
.lower();
getActionDefinitionsBuilder(G_FFREXP)
- .customFor({{S32, S32}, {S64, S32}})
+ .customFor({{F32, F32}, {F64, F32}})
.scalarize(0)
- .minScalar(0, S32)
- .clampScalar(1, S32, S32)
+ .minScalar(0, I32)
+ .clampScalar(1, I32, I32)
.lower();
}
auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
if (ST.hasCvtPkF16F32Inst())
FPTruncActions.legalFor(
- {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}});
+ {{F32, F64}, {F16, F32}, {V2F16, V2F32}, {V2F16, V2F64}});
else
- FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
+ FPTruncActions.legalFor({{F32, F64}, {F16, F32}});
+ FPTruncActions.customFor({{BF16, F32}});
FPTruncActions.scalarize(0).lower();
getActionDefinitionsBuilder(G_FPEXT)
- .legalFor({{S64, S32}, {S32, S16}})
- .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
+ .legalFor({{F64, F32}, {F32, F16}})
+ .customFor({{F32, BF16}})
+ .narrowScalarFor({{I64, I16}}, changeTo(0, I32))
.scalarize(0);
auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
if (ST.has16BitInsts()) {
FSubActions
// Use actual fsub instruction
- .legalFor({S32, S16})
+ .legalFor({F32, F16})
// Must use fadd + fneg
- .lowerFor({S64, V2S16});
+ .lowerFor({F64, V2F16});
} else {
FSubActions
// Use actual fsub instruction
- .legalFor({S32})
+ .legalFor({F32})
// Must use fadd + fneg
- .lowerFor({S64, S16, V2S16});
+ .lowerFor({F64, F16, V2F16});
}
FSubActions
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, F32, F64);
// Whether this is legal depends on the floating point mode for the function.
auto &FMad = getActionDefinitionsBuilder(G_FMAD);
if (ST.hasMadF16() && ST.hasMadMacF32Insts())
- FMad.customFor({S32, S16});
+ FMad.customFor({F32, F16});
else if (ST.hasMadMacF32Insts())
- FMad.customFor({S32});
+ FMad.customFor({F32});
else if (ST.hasMadF16())
- FMad.customFor({S16});
+ FMad.customFor({F16});
FMad.scalarize(0)
.lower();
auto &FRem = getActionDefinitionsBuilder(G_FREM);
if (ST.has16BitInsts()) {
- FRem.customFor({S16, S32, S64});
+ FRem.customFor({F16, F32, F64});
} else {
- FRem.minScalar(0, S32)
- .customFor({S32, S64});
+ FRem.minScalar(0, F32)
+ .customFor({F32, F64});
}
FRem.scalarize(0);
// TODO: Do we need to clamp maximum bitwidth?
getActionDefinitionsBuilder(G_TRUNC)
.legalIf(isScalar(0))
- .legalFor({{V2S16, V2S32}})
- .clampMaxNumElements(0, S16, 2)
+ .legalFor({{V2F16, V2F32}})
+ .clampMaxNumElements(0, F16, 2)
// Avoid scalarizing in cases that should be truly illegal. In unresolvable
// situations (like an invalid implicit use), we don't want to infinite loop
// in the legalizer.
@@ -1105,45 +1183,45 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.alwaysLegal();
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
- .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
- {S32, S1}, {S64, S1}, {S16, S1}})
+ .legalFor({{I64, I32}, {I32, I16}, {I64, I16},
+ {I32, I1}, {I64, I1}, {I16, I1}})
.scalarize(0)
- .clampScalar(0, S32, S64)
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(1, 32);
// TODO: Split s1->s64 during regbankselect for VALU.
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
- .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
- .lowerIf(typeIs(1, S1))
- .customFor({{S32, S64}, {S64, S64}});
+ .legalFor({{F32, I32}, {F64, I32}, {F16, I32}})
+ .lowerIf(typeIs(1, I1))
+ .customFor({{F32, I64}, {F64, I64}});
if (ST.has16BitInsts())
- IToFP.legalFor({{S16, S16}});
- IToFP.clampScalar(1, S32, S64)
- .minScalar(0, S32)
+ IToFP.legalFor({{F16, I16}});
+ IToFP.clampScalar(1, I32, I64)
+ .minScalar(0, I32)
.scalarize(0)
.widenScalarToNextPow2(1);
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
- .customFor({{S64, S32}, {S64, S64}})
- .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
+ .legalFor({{I32, F32}, {I32, F64}, {I32, F16}})
+ .customFor({{I64, F32}, {I64, F64}})
+ .narrowScalarFor({{I64, I16}}, changeTo(0, I32));
if (ST.has16BitInsts())
- FPToI.legalFor({{S16, S16}});
+ FPToI.legalFor({{I16, F16}});
else
- FPToI.minScalar(1, S32);
+ FPToI.minScalar(1, I32);
- FPToI.minScalar(0, S32)
+ FPToI.minScalar(0, I32)
.widenScalarToNextPow2(0, 32)
.scalarize(0)
.lower();
getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
- .clampScalar(0, S16, S64)
+ .clampScalar(0, F16, F64)
.scalarize(0)
.lower();
getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
- .legalFor({S16, S32})
+ .legalFor({F16, F32})
.scalarize(0)
.lower();
@@ -1153,28 +1231,28 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
- .clampScalar(0, S16, S64)
+ .clampScalar(0, F16, F64)
.scalarize(0)
.lower();
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
- .legalFor({S16, S32, S64})
- .clampScalar(0, S16, S64)
+ .legalFor({F16, F32, F64})
+ .clampScalar(0, F16, F64)
.scalarize(0);
} else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
getActionDefinitionsBuilder(
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
- .legalFor({S32, S64})
- .clampScalar(0, S32, S64)
+ .legalFor({F32, F64})
+ .clampScalar(0, F32, F64)
.scalarize(0);
} else {
getActionDefinitionsBuilder(
{G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
- .legalFor({S32})
- .customFor({S64})
- .clampScalar(0, S32, S64)
+ .legalFor({F32})
+ .customFor({F64})
+ .clampScalar(0, F32, F64)
.scalarize(0);
}
@@ -1185,7 +1263,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarSameSizeAs(1, 0);
getActionDefinitionsBuilder(G_PTRMASK)
- .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
+ .legalIf(all(sameSize(0, 1), typeInSet(1, {I64, I32})))
.scalarSameSizeAs(1, 0)
.scalarize(0);
@@ -1202,79 +1280,79 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
// bank.
.legalForCartesianProduct(
- {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
+ {I1}, {I32, I64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
.legalForCartesianProduct(
- {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
+ {I32}, {I32, I64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
- CmpBuilder.legalFor({{S1, S16}});
+ CmpBuilder.legalFor({{I1, I16}});
}
CmpBuilder
.widenScalarToNextPow2(1)
- .clampScalar(1, S32, S64)
+ .clampScalar(1, I32, I64)
.scalarize(0)
- .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
+ .legalIf(all(typeInSet(0, {I1, I32}), isPointer(1)));
auto &FCmpBuilder =
getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
- {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
+ {I1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
if (ST.hasSALUFloatInsts())
- FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
+ FCmpBuilder.legalForCartesianProduct({I32}, {F16, F32});
FCmpBuilder
.widenScalarToNextPow2(1)
- .clampScalar(1, S32, S64)
+ .clampScalar(1, F32, F64)
.scalarize(0);
// FIXME: fpow has a selection pattern that should move to custom lowering.
auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
if (ST.has16BitInsts())
- ExpOps.customFor({{S32}, {S16}});
+ ExpOps.customFor({{F32}, {F16}});
else
- ExpOps.customFor({S32});
- ExpOps.clampScalar(0, MinScalarFPTy, S32)
+ ExpOps.customFor({F32});
+ ExpOps.clampScalar(0, MinScalarFPTy, F32)
.scalarize(0);
getActionDefinitionsBuilder(G_FPOWI)
- .clampScalar(0, MinScalarFPTy, S32)
+ .clampScalar(0, MinScalarFPTy, F32)
.lower();
auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
- Log2Ops.customFor({S32});
+ Log2Ops.customFor({F32});
if (ST.has16BitInsts())
- Log2Ops.legalFor({S16});
+ Log2Ops.legalFor({F16});
else
- Log2Ops.customFor({S16});
+ Log2Ops.customFor({F16});
Log2Ops.scalarize(0)
.lower();
auto &LogOps =
getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
- LogOps.customFor({S32, S16});
- LogOps.clampScalar(0, MinScalarFPTy, S32)
+ LogOps.customFor({F32, F16});
+ LogOps.clampScalar(0, MinScalarFPTy, F32)
.scalarize(0);
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
- .legalFor({{S32, S32}, {S32, S64}})
- .clampScalar(0, S32, S32)
+ .legalFor({{I32, I32}, {I32, I64}})
+ .clampScalar(0, I32, I32)
.widenScalarToNextPow2(1, 32)
- .clampScalar(1, S32, S64)
+ .clampScalar(1, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0, 32);
// If no 16 bit instr is available, lower into different instructions.
if (ST.has16BitInsts())
getActionDefinitionsBuilder(G_IS_FPCLASS)
- .legalForCartesianProduct({S1}, FPTypes16)
+ .legalForCartesianProduct({I1}, FPTypes16)
.widenScalarToNextPow2(1)
.scalarize(0)
.lower();
else
getActionDefinitionsBuilder(G_IS_FPCLASS)
- .legalForCartesianProduct({S1}, FPTypesBase)
- .lowerFor({S1, S16})
+ .legalForCartesianProduct({I1}, FPTypesBase)
+ .lowerFor({I1, I16})
.widenScalarToNextPow2(1)
.scalarize(0)
.lower();
@@ -1284,26 +1362,26 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// bitwidth.
getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
.scalarize(0)
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
+ .clampScalar(0, I32, I32)
+ .clampScalar(1, I32, I64)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32)
.custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
- .legalFor({{S32, S32}, {S32, S64}})
+ .legalFor({{I32, I32}, {I32, I64}})
.customIf(scalarNarrowerThan(1, 32))
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
+ .clampScalar(0, I32, I32)
+ .clampScalar(1, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
- .legalFor({{S32, S32}, {S32, S64}})
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
+ .legalFor({{I32, I32}, {I32, I64}})
+ .clampScalar(0, I32, I32)
+ .clampScalar(1, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
@@ -1311,52 +1389,52 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
getActionDefinitionsBuilder(G_BITREVERSE)
- .legalFor({S32, S64})
- .clampScalar(0, S32, S64)
+ .legalFor({I32, I64})
+ .clampScalar(0, I32, I64)
.scalarize(0)
.widenScalarToNextPow2(0);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_BSWAP)
- .legalFor({S16, S32, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({I16, I32, V2I16})
+ .clampMaxNumElementsStrict(0, I16, 2)
// FIXME: Fixing non-power-of-2 before clamp is workaround for
// narrowScalar limitation.
.widenScalarToNextPow2(0)
- .clampScalar(0, S16, S32)
+ .clampScalar(0, I16, I32)
.scalarize(0);
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElements(0, S16, 2)
- .minScalar(0, S16)
+ .legalFor({I32, I16, V2I16})
+ .clampMaxNumElements(0, I16, 2)
+ .minScalar(0, I16)
.widenScalarToNextPow2(0)
.scalarize(0)
.lower();
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32, S16})
+ .legalFor({I32, I16})
.widenScalarToNextPow2(0)
- .minScalar(0, S16)
+ .minScalar(0, I16)
.scalarize(0)
.lower();
}
} else {
// TODO: Should have same legality without v_perm_b32
getActionDefinitionsBuilder(G_BSWAP)
- .legalFor({S32})
+ .legalFor({I32})
.lowerIf(scalarNarrowerThan(0, 32))
// FIXME: Fixing non-power-of-2 before clamp is workaround for
// narrowScalar limitation.
.widenScalarToNextPow2(0)
- .maxScalar(0, S32)
+ .maxScalar(0, I32)
.scalarize(0)
.lower();
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
- .legalFor({S32})
- .minScalar(0, S32)
+ .legalFor({I32})
+ .minScalar(0, I32)
.widenScalarToNextPow2(0)
.scalarize(0)
.lower();
@@ -1364,8 +1442,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_INTTOPTR)
// List the common cases
- .legalForCartesianProduct(AddrSpaces64, {S64})
- .legalForCartesianProduct(AddrSpaces32, {S32})
+ .legalForCartesianProduct(AddrSpaces64, {I64})
+ .legalForCartesianProduct(AddrSpaces32, {I32})
.scalarize(0)
// Accept any address space as long as the size matches
.legalIf(sameSize(0, 1))
@@ -1380,18 +1458,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_PTRTOINT)
// List the common cases
- .legalForCartesianProduct(AddrSpaces64, {S64})
- .legalForCartesianProduct(AddrSpaces32, {S32})
+ .legalForCartesianProduct(AddrSpaces64, {I64})
+ .legalForCartesianProduct(AddrSpaces32, {I32})
.scalarize(0)
// Accept any address space as long as the size matches
.legalIf(sameSize(0, 1))
.widenScalarIf(smallerThan(0, 1),
[](const LegalityQuery &Query) {
return std::pair(
- 0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ 0, LLT::integer(Query.Types[1].getSizeInBits()));
})
.narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
- return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ return std::pair(0, LLT::integer(Query.Types[1].getSizeInBits()));
});
getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
@@ -1444,32 +1522,50 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &Actions = getActionDefinitionsBuilder(Op);
// Explicitly list some common cases.
// TODO: Does this help compile time at all?
- Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
- {V2S32, GlobalPtr, V2S32, GlobalAlign32},
- {V4S32, GlobalPtr, V4S32, GlobalAlign32},
- {S64, GlobalPtr, S64, GlobalAlign32},
- {V2S64, GlobalPtr, V2S64, GlobalAlign32},
- {V2S16, GlobalPtr, V2S16, GlobalAlign32},
- {S32, GlobalPtr, S8, GlobalAlign8},
- {S32, GlobalPtr, S16, GlobalAlign16},
-
- {S32, LocalPtr, S32, 32},
- {S64, LocalPtr, S64, 32},
- {V2S32, LocalPtr, V2S32, 32},
- {S32, LocalPtr, S8, 8},
- {S32, LocalPtr, S16, 16},
- {V2S16, LocalPtr, S32, 32},
-
- {S32, PrivatePtr, S32, 32},
- {S32, PrivatePtr, S8, 8},
- {S32, PrivatePtr, S16, 16},
- {V2S16, PrivatePtr, S32, 32},
-
- {S32, ConstantPtr, S32, GlobalAlign32},
- {V2S32, ConstantPtr, V2S32, GlobalAlign32},
- {V4S32, ConstantPtr, V4S32, GlobalAlign32},
- {S64, ConstantPtr, S64, GlobalAlign32},
- {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
+ Actions.legalForTypesWithMemDesc({{I32, GlobalPtr, I32, GlobalAlign32},
+ {F32, GlobalPtr, F32, GlobalAlign32},
+
+ {V2I32, GlobalPtr, V2I32, GlobalAlign32},
+ {V2F32, GlobalPtr, V2F32, GlobalAlign32},
+
+ {V4I32, GlobalPtr, V4I32, GlobalAlign32},
+ {V4F32, GlobalPtr, V4F32, GlobalAlign32},
+
+ {I64, GlobalPtr, I64, GlobalAlign32},
+ {F64, GlobalPtr, F64, GlobalAlign32},
+
+ {V2I64, GlobalPtr, V2I64, GlobalAlign32},
+ {V2F64, GlobalPtr, V2F64, GlobalAlign32},
+ {V2I16, GlobalPtr, V2I16, GlobalAlign32},
+ {V2F16, GlobalPtr, V2F16, GlobalAlign32},
+ {V2BF16, GlobalPtr, V2BF16, GlobalAlign32},
+
+ {I32, GlobalPtr, I8, GlobalAlign8},
+ {I32, GlobalPtr, I16, GlobalAlign16},
+
+ {I32, LocalPtr, I32, 32},
+ {F32, LocalPtr, F32, 32},
+ {I64, LocalPtr, I64, 32},
+ {F64, LocalPtr, F64, 32},
+ {V2I32, LocalPtr, V2I32, 32},
+ {V2F32, LocalPtr, V2F32, 32},
+ {I32, LocalPtr, I8, 8},
+ {I32, LocalPtr, I16, 16},
+ {V2I16, LocalPtr, I32, 32},
+
+ {I32, PrivatePtr, I32, 32},
+ {F32, PrivatePtr, F32, 32},
+ {I32, PrivatePtr, I8, 8},
+ {I32, PrivatePtr, I16, 16},
+ {V2I16, PrivatePtr, I32, 32},
+
+ {I32, ConstantPtr, I32, GlobalAlign32},
+ {F32, ConstantPtr, F32, GlobalAlign32},
+ {V2I32, ConstantPtr, V2I32, GlobalAlign32},
+ {V4I32, ConstantPtr, V4I32, GlobalAlign32},
+ {I64, ConstantPtr, I64, GlobalAlign32},
+ {F64, ConstantPtr, F64, GlobalAlign32},
+ {V2I32, ConstantPtr, V2I32, GlobalAlign32}});
Actions.legalIf(
[=](const LegalityQuery &Query) -> bool {
return isLoadStoreLegal(ST, Query);
@@ -1531,16 +1627,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Split extloads.
if (DstSize > MemSize)
- return std::pair(0, LLT::scalar(MemSize));
+ return std::pair(0, LLT::integer(MemSize));
unsigned MaxSize = maxSizeForAddrSpace(
ST, PtrTy.getAddressSpace(), Op == G_LOAD,
Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
if (MemSize > MaxSize)
- return std::pair(0, LLT::scalar(MaxSize));
+ return std::pair(0, LLT::integer(MaxSize));
uint64_t Align = Query.MMODescrs[0].AlignInBits;
- return std::pair(0, LLT::scalar(Align));
+ return std::pair(0, LLT::integer(Align));
})
.fewerElementsIf(
[=](const LegalityQuery &Query) -> bool {
@@ -1603,8 +1699,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// May need relegalization for the scalars.
return std::pair(0, EltTy);
})
- .minScalar(0, S32)
- .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
+ .minScalar(0, I32)
+ .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, I32))
.widenScalarToNextPow2(0)
.moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
.lower();
@@ -1612,14 +1708,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Unaligned accesses not lowered.
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
- {S32, GlobalPtr, S16, 2 * 8},
- {S32, LocalPtr, S8, 8},
- {S32, LocalPtr, S16, 16},
- {S32, PrivatePtr, S8, 8},
- {S32, PrivatePtr, S16, 16},
- {S32, ConstantPtr, S8, 8},
- {S32, ConstantPtr, S16, 2 * 8}})
+ .legalForTypesWithMemDesc({{I32, GlobalPtr, I8, 8},
+ {I32, GlobalPtr, I16, 2 * 8},
+ {I32, LocalPtr, I8, 8},
+ {I32, LocalPtr, I16, 16},
+ {I32, PrivatePtr, I8, 8},
+ {I32, PrivatePtr, I16, 16},
+ {I32, ConstantPtr, I8, 8},
+ {I32, ConstantPtr, I16, 2 * 8}})
.legalIf(
[=](const LegalityQuery &Query) -> bool {
return isLoadStoreLegal(ST, Query);
@@ -1627,7 +1723,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAddressSpace()) {
ExtLoads.legalForTypesWithMemDesc(
- {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
+ {{I32, FlatPtr, I8, 8}, {I32, FlatPtr, I16, 16}});
}
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
@@ -1637,7 +1733,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// inserting addrspacecasts.
ExtLoads.customIf(typeIs(1, Constant32Ptr));
- ExtLoads.clampScalar(0, S32, S32)
+ ExtLoads.clampScalar(0, I32, I32)
.widenScalarToNextPow2(0)
.lower();
@@ -1646,35 +1742,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
- .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
- {S64, GlobalPtr}, {S64, LocalPtr},
- {S32, RegionPtr}, {S64, RegionPtr}});
+ .legalFor({{I32, GlobalPtr}, {I32, LocalPtr},
+ {I64, GlobalPtr}, {I64, LocalPtr},
+ {I32, RegionPtr}, {I64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
- Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
+ Atomics.legalFor({{I32, FlatPtr}, {I64, FlatPtr}});
}
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
- Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
+ Atomic.legalFor({{F32, LocalPtr}, {F32, RegionPtr}});
if (ST.hasLdsAtomicAddF64())
- Atomic.legalFor({{S64, LocalPtr}});
+ Atomic.legalFor({{F64, LocalPtr}});
if (ST.hasAtomicDsPkAdd16Insts())
Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
}
if (ST.hasAtomicFaddInsts())
- Atomic.legalFor({{S32, GlobalPtr}});
+ Atomic.legalFor({{F32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
- Atomic.legalFor({{S32, FlatPtr}});
+ Atomic.legalFor({{F32, FlatPtr}});
if (ST.hasGFX90AInsts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
// TODO: Move atomic expansion into legalizer
Atomic.legalFor({
- {S32, GlobalPtr},
- {S64, GlobalPtr},
- {S64, FlatPtr}
+ {F32, GlobalPtr},
+ {F64, GlobalPtr},
+ {F64, FlatPtr}
});
}
@@ -1705,40 +1801,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
- .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
- {S32, FlatPtr}, {S64, FlatPtr}})
- .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
- {S32, RegionPtr}, {S64, RegionPtr}});
+ .customFor({{I32, GlobalPtr}, {I64, GlobalPtr},
+ {I32, FlatPtr}, {I64, FlatPtr}})
+ .legalFor({{I32, LocalPtr}, {I64, LocalPtr},
+ {I32, RegionPtr}, {I64, RegionPtr}});
// TODO: Pointer types, any 32-bit or 64-bit vector
// Condition should be s32 for scalar, s1 for vector.
getActionDefinitionsBuilder(G_SELECT)
- .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
+ .legalForCartesianProduct({I16, F16, BF16, I32, F32, I64, F64, V2I32, V2F32, V2I16, V2F16, V2BF16, V4I16, V4F16, V4BF16, GlobalPtr,
LocalPtr, FlatPtr, PrivatePtr,
LLT::fixed_vector(2, LocalPtr),
LLT::fixed_vector(2, PrivatePtr)},
- {S1, S32})
- .clampScalar(0, S16, S64)
+ {I1, I32})
+ .clampScalar(0, I16, I64)
.scalarize(1)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(numElementsNotEven(0), scalarize(0))
- .clampMaxNumElements(0, S32, 2)
+ .clampMaxNumElements(0, I32, 2)
.clampMaxNumElements(0, LocalPtr, 2)
.clampMaxNumElements(0, PrivatePtr, 2)
.scalarize(0)
.widenScalarToNextPow2(0)
- .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
+ .legalIf(all(isPointer(0), typeInSet(1, {I1, I32})));
// TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
// be more flexible with the shift amount type.
auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
- .legalFor({{S32, S32}, {S64, S32}});
+ .legalFor({{I32, I32}, {I64, I32}});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts()) {
- Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
- .clampMaxNumElements(0, S16, 2);
+ Shifts.legalFor({{I16, I16}, {V2I16, V2I16}})
+ .clampMaxNumElements(0, I16, 2);
} else
- Shifts.legalFor({{S16, S16}});
+ Shifts.legalFor({{I16, I16}});
// TODO: Support 16-bit shift amounts for all types
Shifts.widenScalarIf(
@@ -1749,26 +1845,26 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT AmountTy = Query.Types[1];
return ValTy.getSizeInBits() <= 16 &&
AmountTy.getSizeInBits() < 16;
- }, changeTo(1, S16));
- Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
- Shifts.clampScalar(1, S32, S32);
+ }, changeTo(1, I16));
+ Shifts.maxScalarIf(typeIs(0, I16), 1, I16);
+ Shifts.clampScalar(1, I32, I32);
Shifts.widenScalarToNextPow2(0, 16);
- Shifts.clampScalar(0, S16, S64);
+ Shifts.clampScalar(0, I16, I64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
- .minScalar(0, S16)
+ .minScalar(0, I16)
.scalarize(0)
.lower();
} else {
// Make sure we legalize the shift amount type first, as the general
// expansion for the shifted type will produce much worse code if it hasn't
// been truncated already.
- Shifts.clampScalar(1, S32, S32);
+ Shifts.clampScalar(1, I32, I32);
Shifts.widenScalarToNextPow2(0, 32);
- Shifts.clampScalar(0, S32, S64);
+ Shifts.clampScalar(0, I32, I64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
- .minScalar(0, S32)
+ .minScalar(0, I32)
.scalarize(0)
.lower();
}
@@ -1820,10 +1916,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
VecTypeIdx,
LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
})
- .clampScalar(EltTypeIdx, S32, S64)
- .clampScalar(VecTypeIdx, S32, S64)
- .clampScalar(IdxTypeIdx, S32, S32)
- .clampMaxNumElements(VecTypeIdx, S32, 32)
+ .clampScalar(EltTypeIdx, I32, I64)
+ .clampScalar(VecTypeIdx, I32, I64)
+ .clampScalar(IdxTypeIdx, I32, I32)
+ .clampMaxNumElements(VecTypeIdx, I32, 32)
+ .clampMaxNumElements(VecTypeIdx, F32, 32)
// TODO: Clamp elements for 64-bit vectors?
.moreElementsIf(
isIllegalRegisterType(VecTypeIdx),
@@ -1845,7 +1942,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Doesn't handle extract of illegal sizes.
getActionDefinitionsBuilder(Op)
- .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
+ .lowerIf(all(typeIs(LitTyIdx, I16), sizeIs(BigTyIdx, 32)))
.lowerIf([=](const LegalityQuery &Query) {
// Sub-vector(or single element) insert and extract.
// TODO: verify immediate offset here since lower only works with
@@ -1878,11 +1975,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V32S32)
- .clampNumElements(0, V2S64, V16S64)
- .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
+ .legalForCartesianProduct(AllS32Vectors, {I32})
+ .legalForCartesianProduct(AllS32Vectors, {F32})
+ .legalForCartesianProduct(AllS64Vectors, {I64})
+ .legalForCartesianProduct(AllS64Vectors, {F64})
+ .clampNumElements(0, V16I32, V32I32)
+ .clampNumElements(0, V2I64, V16I64)
+ .fewerElementsIf(isWideVec16(0), changeElementCountTo(0, V2I16))
.moreElementsIf(
isIllegalRegisterType(0),
moreElementsToNextExistingRegClass(0));
@@ -1890,18 +1989,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasScalarPackInsts()) {
BuildVector
// FIXME: Should probably widen s1 vectors straight to s32
- .minScalarOrElt(0, S16)
- .minScalar(1, S16);
+ .minScalarOrElt(0, I16)
+ .minScalar(1, I16);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
- .legalFor({V2S16, S32})
+ .legalFor({V2I16, I32})
.lower();
} else {
- BuildVector.customFor({V2S16, S16});
- BuildVector.minScalarOrElt(0, S32);
+ BuildVector.customFor({V2I16, I16});
+ BuildVector.customFor({V2F16, F16});
+ BuildVector.customFor({V2BF16, BF16});
+ BuildVector.minScalarOrElt(0, I32);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
- .customFor({V2S16, S32})
+ .customFor({V2I16, I32})
.lower();
}
@@ -1910,9 +2011,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalIf(all(isRegisterType(0), isRegisterType(1)))
- .clampMaxNumElements(0, S32, 32)
- .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
- .clampMaxNumElements(0, S16, 64);
+ .clampMaxNumElements(0, I32, 32)
+ .clampMaxNumElements(1, I16, 2) // TODO: Make 4?
+ .clampMaxNumElements(0, I16, 64);
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
@@ -1935,23 +2036,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &Builder = getActionDefinitionsBuilder(Op)
.legalIf(all(isRegisterType(0), isRegisterType(1)))
- .lowerFor({{S16, V2S16}})
+ .lowerFor({{I16, V2I16}})
+ .lowerFor({{F16, V2F16}})
+ .lowerFor({{BF16, V2BF16}})
.lowerIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
return BigTy.getSizeInBits() == 32;
})
// Try to widen to s16 first for small types.
// TODO: Only do this on targets with legal s16 shifts
- .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
+ .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, I16)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
- .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
- elementTypeIs(1, S16)),
- changeTo(1, V2S16))
+ .fewerElementsIf(all(typeIs(0, I16), vectorWiderThan(1, 32),
+ elementTypeIs(1, I16)),
+ changeTo(1, V2I16))
+ .fewerElementsIf(all(typeIs(0, F16), vectorWiderThan(1, 32),
+ elementTypeIs(1, F16)),
+ changeTo(1, V2F16))
+ .fewerElementsIf(all(typeIs(0, BF16), vectorWiderThan(1, 32),
+ elementTypeIs(1, BF16)),
+ changeTo(1, V2BF16))
// Clamp the little scalar to s8-s256 and make it a power of 2. It's not
// worth considering the multiples of 64 since 2*192 and 2*384 are not
// valid.
- .clampScalar(LitTyIdx, S32, S512)
+ .clampScalar(LitTyIdx, I32, I512)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
// Break up vectors with weird elements into scalars
.fewerElementsIf(
@@ -1960,7 +2069,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
scalarize(1))
- .clampScalar(BigTyIdx, S32, MaxScalar);
+ .clampScalar(BigTyIdx, I32, MaxScalar);
if (Op == G_MERGE_VALUES) {
Builder.widenScalarIf(
@@ -1969,7 +2078,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT Ty = Query.Types[LitTyIdx];
return Ty.getSizeInBits() < 32;
},
- changeTo(LitTyIdx, S32));
+ changeTo(LitTyIdx, I32));
}
Builder.widenScalarIf(
@@ -1997,25 +2106,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
- .legalFor({{S32}, {S64}});
+ .legalFor({{I32}, {I64}});
if (ST.hasVOP3PInsts()) {
- SextInReg.lowerFor({{V2S16}})
+ SextInReg.lowerFor({{V2I16}})
// Prefer to reduce vector widths for 16-bit vectors before lowering, to
// get more vector shift opportunities, since we'll get those when
// expanded.
- .clampMaxNumElementsStrict(0, S16, 2);
+ .clampMaxNumElementsStrict(0, I16, 2);
} else if (ST.has16BitInsts()) {
- SextInReg.lowerFor({{S32}, {S64}, {S16}});
+ SextInReg.lowerFor({{I32}, {I64}, {I16}});
} else {
// Prefer to promote to s32 before lowering if we don't have 16-bit
// shifts. This avoid a lot of intermediate truncate and extend operations.
- SextInReg.lowerFor({{S32}, {S64}});
+ SextInReg.lowerFor({{I32}, {I64}});
}
SextInReg
.scalarize(0)
- .clampScalar(0, S32, S64)
+ .clampScalar(0, I32, I64)
.lower();
getActionDefinitionsBuilder({G_ROTR, G_ROTL})
@@ -2024,16 +2133,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
- .legalFor({{S32, S32}})
- .lowerFor({{V2S16, V2S16}})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .legalFor({{I32, I32}})
+ .lowerFor({{V2I16, V2I16}})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
.lower();
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder(G_FSHL)
- .lowerFor({{V2S16, V2S16}})
- .clampMaxNumElementsStrict(0, S16, 2)
+ .lowerFor({{V2I16, V2I16}})
+ .clampMaxNumElementsStrict(0, I16, 2)
.scalarize(0)
.lower();
} else {
@@ -2043,22 +2152,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
- .legalFor({S64});
+ .legalFor({I64});
- getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
+ getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({I64});
getActionDefinitionsBuilder(G_FENCE)
.alwaysLegal();
getActionDefinitionsBuilder({G_SMULO, G_UMULO})
.scalarize(0)
- .minScalar(0, S32)
+ .minScalar(0, I32)
.lower();
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
- .legalFor({{S32, S32}, {S64, S32}})
- .clampScalar(1, S32, S32)
- .clampScalar(0, S32, S64)
+ .legalFor({{I32, I32}, {I64, I32}})
+ .clampScalar(1, I32, I32)
+ .clampScalar(0, I32, I64)
.widenScalarToNextPow2(0)
.scalarize(0);
@@ -2075,7 +2184,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasIEEEMinMax()) {
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
.legalFor(FPTypesPK16)
- .clampMaxNumElements(0, S16, 2)
+ .clampMaxNumElements(0, I16, 2)
.scalarize(0);
} else {
// TODO: Implement
@@ -2141,6 +2250,10 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
return legalizeLoad(Helper, MI);
+ case TargetOpcode::G_FPEXT:
+ return legalizeFPExt(MI, MRI, B);
+ case TargetOpcode::G_FPTRUNC:
+ return legalizeFPTrunc(MI, MRI, B);
case TargetOpcode::G_STORE:
return legalizeStore(Helper, MI);
case TargetOpcode::G_FMAD:
@@ -2208,8 +2321,6 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
@@ -2227,10 +2338,10 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
// Register TableGen definitions would need an overhaul to get rid of the
// artificial "HI" aperture registers and prevent this kind of issue from
// happening.
- Register Dst = MRI.createGenericVirtualRegister(S64);
+ Register Dst = MRI.createGenericVirtualRegister(I64);
MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
- return B.buildUnmerge(S32, Dst).getReg(1);
+ return B.buildUnmerge(I32, Dst).getReg(1);
}
// TODO: can we be smarter about machine pointer info?
@@ -2258,13 +2369,13 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- LLT::scalar(32), commonAlignment(Align(64), Offset));
+ I32, commonAlignment(Align(64), Offset));
// Pointer address
B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildConstant(I64, Offset).getReg(0));
// Load address
- return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+ return B.buildLoad(I32, LoadAddr, *MMO).getReg(0);
}
Register QueuePtr = MRI.createGenericVirtualRegister(
@@ -2281,11 +2392,11 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- LLT::scalar(32), commonAlignment(Align(64), StructOffset));
+ I32, commonAlignment(Align(64), StructOffset));
B.buildPtrAdd(LoadAddr, QueuePtr,
- B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
- return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+ B.buildConstant(I64, StructOffset).getReg(0));
+ return B.buildLoad(I32, LoadAddr, *MMO).getReg(0);
}
/// Return true if the value is a known valid address, such that a null check is
@@ -2319,8 +2430,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
(isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
Intrinsic::amdgcn_addrspacecast_nonnull));
-
- const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
: MI.getOperand(1).getReg();
@@ -2362,7 +2471,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
auto CmpRes =
- B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
+ B.buildICmp(CmpInst::ICMP_NE, I1, Src, FlatNull.getReg(0));
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
@@ -2379,7 +2488,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// Coerce the type of the low half of the result so we can use
// merge_values.
- Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+ Register SrcAsInt = B.buildPtrToInt(I32, Src).getReg(0);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
@@ -2399,7 +2508,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
- auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
+ auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, I1, Src,
SegmentNull.getReg(0));
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
@@ -2420,8 +2529,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
DstTy.getSizeInBits() == 64) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
uint32_t AddrHiVal = Info->get32BitAddressHighBits();
- auto PtrLo = B.buildPtrToInt(S32, Src);
- auto HighAddr = B.buildConstant(S32, AddrHiVal);
+ auto PtrLo = B.buildPtrToInt(I32, Src);
+ auto HighAddr = B.buildConstant(I32, AddrHiVal);
B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
MI.eraseFromParent();
return true;
@@ -2442,7 +2551,7 @@ bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
MachineIRBuilder &B) const {
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
- assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
+ assert(Ty.isFloat() && Ty.getSizeInBits() == 64);
APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
@@ -2466,25 +2575,21 @@ bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFceil(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
-
- const LLT S1 = LLT::scalar(1);
- const LLT S64 = LLT::scalar(64);
-
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src).isScalar(64));
+ assert(MRI.getType(Src).isFloat(64));
// result = trunc(src)
// if (src > 0.0 && src != result)
// result += 1.0
- auto Trunc = B.buildIntrinsicTrunc(S64, Src);
+ auto Trunc = B.buildIntrinsicTrunc(F64, Src);
- const auto Zero = B.buildFConstant(S64, 0.0);
- const auto One = B.buildFConstant(S64, 1.0);
- auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
- auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
- auto And = B.buildAnd(S1, Lt0, NeTrunc);
- auto Add = B.buildSelect(S64, And, One, Zero);
+ const auto Zero = B.buildFConstant(F64, 0.0);
+ const auto One = B.buildFConstant(F64, 1.0);
+ auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, I1, Src, Zero);
+ auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, I1, Src, Trunc);
+ auto And = B.buildAnd(I1, Lt0, NeTrunc);
+ auto Add = B.buildSelect(F64, And, One, Zero);
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
@@ -2513,31 +2618,26 @@ static MachineInstrBuilder extractF64Exponent(Register Hi,
MachineIRBuilder &B) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
- LLT S32 = LLT::scalar(32);
- auto Const0 = B.buildConstant(S32, FractBits - 32);
- auto Const1 = B.buildConstant(S32, ExpBits);
+ auto Const0 = B.buildConstant(I32, FractBits - 32);
+ auto Const1 = B.buildConstant(I32, ExpBits);
- auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
+ auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {I32})
.addUse(Hi)
.addUse(Const0.getReg(0))
.addUse(Const1.getReg(0));
- return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
+ return B.buildSub(I32, ExpPart, B.buildConstant(I32, 1023));
}
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
-
Register Src = MI.getOperand(1).getReg();
- assert(MRI.getType(Src).isScalar(64));
+ assert(MRI.getType(Src).isFloat(64));
// TODO: Should this use extract since the low half is unused?
- auto Unmerge = B.buildUnmerge({S32, S32}, Src);
+ auto Unmerge = B.buildUnmerge({I32, I32}, Src);
Register Hi = Unmerge.getReg(1);
// Extract the upper half, since this is where we will find the sign and
@@ -2547,25 +2647,26 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
const unsigned FractBits = 52;
// Extract the sign bit.
- const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
- auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
+ const auto SignBitMask = B.buildConstant(I32, UINT32_C(1) << 31);
+ auto SignBit = B.buildAnd(I32, Hi, SignBitMask);
- const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
+ const auto FractMask = B.buildConstant(I64, (UINT64_C(1) << FractBits) - 1);
- const auto Zero32 = B.buildConstant(S32, 0);
+ const auto Zero32 = B.buildConstant(I32, 0);
// Extend back to 64-bits.
- auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
+ auto SignBit64 = B.buildMergeLikeInstr(I64, {Zero32, SignBit});
+ SignBit64 = B.buildBitcast(F64, SignBit64);
- auto Shr = B.buildAShr(S64, FractMask, Exp);
- auto Not = B.buildNot(S64, Shr);
- auto Tmp0 = B.buildAnd(S64, Src, Not);
- auto FiftyOne = B.buildConstant(S32, FractBits - 1);
+ auto Shr = B.buildAShr(I64, FractMask, Exp);
+ auto Not = B.buildNot(I64, Shr);
+ auto Tmp0 = B.buildBitcast(F64, B.buildAnd(I64, Src, Not));
+ auto FiftyOne = B.buildConstant(I32, FractBits - 1);
- auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
- auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
+ auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, I1, Exp, Zero32);
+ auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, I1, Exp, FiftyOne);
- auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
+ auto Tmp1 = B.buildSelect(F64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
MI.eraseFromParent();
return true;
@@ -2578,20 +2679,17 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
-
- assert(MRI.getType(Src).isScalar(64));
+ assert(MRI.getType(Src).isInteger(64));
- auto Unmerge = B.buildUnmerge({S32, S32}, Src);
- auto ThirtyTwo = B.buildConstant(S32, 32);
+ auto Unmerge = B.buildUnmerge({I32, I32}, Src);
+ auto ThirtyTwo = B.buildConstant(I32, 32);
- if (MRI.getType(Dst).isScalar(64)) {
- auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
- : B.buildUITOFP(S64, Unmerge.getReg(1));
+ if (MRI.getType(Dst).isFloat(64)) {
+ auto CvtHi = Signed ? B.buildSITOFP(F64, Unmerge.getReg(1))
+ : B.buildUITOFP(F64, Unmerge.getReg(1));
- auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
- auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
+ auto CvtLo = B.buildUITOFP(F64, Unmerge.getReg(0));
+ auto LdExp = B.buildFLdexp(F64, CvtHi, ThirtyTwo);
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(Dst, LdExp, CvtLo);
@@ -2599,28 +2697,28 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
return true;
}
- assert(MRI.getType(Dst).isScalar(32));
+ assert(MRI.getType(Dst).isFloat(32));
- auto One = B.buildConstant(S32, 1);
+ auto One = B.buildConstant(I32, 1);
MachineInstrBuilder ShAmt;
if (Signed) {
- auto ThirtyOne = B.buildConstant(S32, 31);
- auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
- auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
- auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
- auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
+ auto ThirtyOne = B.buildConstant(I32, 31);
+ auto X = B.buildXor(I32, Unmerge.getReg(0), Unmerge.getReg(1));
+ auto OppositeSign = B.buildAShr(I32, X, ThirtyOne);
+ auto MaxShAmt = B.buildAdd(I32, ThirtyTwo, OppositeSign);
+ auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {I32})
.addUse(Unmerge.getReg(1));
- auto LS2 = B.buildSub(S32, LS, One);
- ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
+ auto LS2 = B.buildSub(I32, LS, One);
+ ShAmt = B.buildUMin(I32, LS2, MaxShAmt);
} else
- ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
- auto Norm = B.buildShl(S64, Src, ShAmt);
- auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
- auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
- auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
- auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
- auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
+ ShAmt = B.buildCTLZ(I32, Unmerge.getReg(1));
+ auto Norm = B.buildShl(I64, Src, ShAmt);
+ auto Unmerge2 = B.buildUnmerge({I32, I32}, Norm);
+ auto Adjust = B.buildUMin(I32, One, Unmerge2.getReg(0));
+ auto Norm2 = B.buildOr(I32, Unmerge2.getReg(1), Adjust);
+ auto FVal = Signed ? B.buildSITOFP(F32, Norm2) : B.buildUITOFP(F32, Norm2);
+ auto Scale = B.buildSub(I32, ThirtyTwo, ShAmt);
B.buildFLdexp(Dst, FVal, Scale);
MI.eraseFromParent();
return true;
@@ -2636,11 +2734,8 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
-
const LLT SrcLT = MRI.getType(Src);
- assert((SrcLT.isScalar(32) || SrcLT.isScalar(64)) && MRI.getType(Dst).isScalar(64));
+ assert((SrcLT.isFloat(32) || SrcLT.isFloat(64)) && MRI.getType(Dst).isInteger(64));
unsigned Flags = MI.getFlags();
@@ -2655,41 +2750,41 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
//
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
MachineInstrBuilder Sign;
- if (Signed && SrcLT.isScalar(32)) {
+ if (Signed && SrcLT.isFloat(32)) {
// However, a 32-bit floating point number has only 23 bits mantissa and
// it's not enough to hold all the significant bits of `lof` if val is
// negative. To avoid the loss of precision, We need to take the absolute
// value after truncating and flip the result back based on the original
// signedness.
- Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
- Trunc = B.buildFAbs(S32, Trunc, Flags);
+ Sign = B.buildAShr(I32, B.buildBitcast(I32, Src), B.buildConstant(I32, 31));
+ Trunc = B.buildFAbs(F32, Trunc, Flags);
}
MachineInstrBuilder K0, K1;
- if (SrcLT.isScalar(64)) {
+ if (SrcLT.isFloat(64)) {
K0 = B.buildFConstant(
- S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
+ F64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
K1 = B.buildFConstant(
- S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
+ F64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
} else {
K0 = B.buildFConstant(
- S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
+ F32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
K1 = B.buildFConstant(
- S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
+ F32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
}
auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
- auto Hi = (Signed && SrcLT.isScalar(64)) ? B.buildFPTOSI(S32, FloorMul)
- : B.buildFPTOUI(S32, FloorMul);
- auto Lo = B.buildFPTOUI(S32, Fma);
+ auto Hi = (Signed && SrcLT.isFloat(64)) ? B.buildFPTOSI(I32, FloorMul)
+ : B.buildFPTOUI(I32, FloorMul);
+ auto Lo = B.buildFPTOUI(I32, Fma);
- if (Signed && SrcLT.isScalar(32)) {
+ if (Signed && SrcLT.isFloat(32)) {
// Flip the result based on the signedness, which is either all 0s or 1s.
- Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
+ Sign = B.buildMergeLikeInstr(I64, {Sign, Sign});
// r := xor({lo, hi}, sign) - sign;
- B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
+ B.buildSub(Dst, B.buildXor(I64, B.buildMergeLikeInstr(I64, {Lo, Hi}), Sign),
Sign);
} else
B.buildMergeLikeInstr(Dst, {Lo, Hi});
@@ -2737,7 +2832,7 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// vector of integers using ptrtoint (and inttoptr on the output) in order to
// drive the legalization forward.
if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
- LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
+ LLT IntTy = LLT::integer(EltTy.getSizeInBits());
LLT IntVecTy = VecTy.changeElementType(IntTy);
auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
@@ -2790,7 +2885,7 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// new value, and then inttoptr the result vector back. This will then allow
// the rest of legalization to take over.
if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
- LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
+ LLT IntTy = LLT::integer(EltTy.getSizeInBits());
LLT IntVecTy = VecTy.changeElementType(IntTy);
auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
@@ -2916,13 +3011,11 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
MachineRegisterInfo &MRI) const {
bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
- LLT S32 = LLT::scalar(32);
-
// Use the destination directly, if and only if we store the lower address
// part only and we don't have a register class being set.
Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
? DstReg
- : MRI.createGenericVirtualRegister(S32);
+ : MRI.createGenericVirtualRegister(I32);
if (!MRI.getRegClassOrNull(AddrLo))
MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
@@ -2937,7 +3030,7 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
assert(PtrTy.getSizeInBits() == 64 &&
"Must provide a 64-bit pointer type!");
- Register AddrHi = MRI.createGenericVirtualRegister(S32);
+ Register AddrHi = MRI.createGenericVirtualRegister(I32);
MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_MOV_B32)
@@ -2948,7 +3041,7 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress(
// class being set.
Register AddrDst = !MRI.getRegClassOrNull(DstReg)
? DstReg
- : MRI.createGenericVirtualRegister(LLT::scalar(64));
+ : MRI.createGenericVirtualRegister(I64);
if (!MRI.getRegClassOrNull(AddrDst))
MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
@@ -3017,8 +3110,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
// Adjust alignment for that dynamic shared memory array.
MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
- LLT S32 = LLT::scalar(32);
- auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
+ auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {I32});
B.buildIntToPtr(DstReg, Sz);
MI.eraseFromParent();
return true;
@@ -3078,7 +3170,7 @@ static LLT widenToNextPowerOf2(LLT Ty) {
if (Ty.isVector())
return Ty.changeElementCount(
ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
- return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
+ return LLT::integer(PowerOf2Ceil(Ty.getSizeInBits()));
}
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
@@ -3188,6 +3280,50 @@ bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
return false;
}
+bool AMDGPULegalizerInfo::legalizeFPExt(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: move to LegalizerHelper
+ const SITargetLowering *TLI = ST.getTargetLowering();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ auto ShiftTy = TLI->getPreferredShiftAmountTy(I32);
+
+ B.buildBitcast(
+ DstReg, B.buildShl(I32, B.buildAnyExt(I32, B.buildBitcast(I16, SrcReg)),
+ B.buildConstant(ShiftTy, 16)));
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFPTrunc(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: move to LegalizerHelper
+ const SITargetLowering *TLI = ST.getTargetLowering();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ auto ShiftTy = TLI->getPreferredShiftAmountTy(I32);
+
+ // FIXME:
+ // if (!DAG.isKnownNeverSNaN(Op)) {
+ // Op = DAG.getNode(ISD::FCANONICALIZE, dl, MVT::f32, Op,
+ // Node->getFlags());
+ // }
+
+ B.buildBitcast(
+ DstReg, B.buildTrunc(I16, B.buildLShr(I32, B.buildBitcast(I32, SrcReg),
+ B.buildConstant(ShiftTy, 16))));
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeFMad(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -3258,7 +3394,7 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
break;
}
case TargetOpcode::G_FPEXT: {
- return MRI.getType(DefMI->getOperand(1).getReg()).isScalar(16);
+ return MRI.getType(DefMI->getOperand(1).getReg()).isFloat(16);
}
default:
return false;
@@ -3287,11 +3423,10 @@ AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
return {};
- const LLT F32 = LLT::scalar(32);
auto SmallestNormal = B.buildFConstant(
F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
auto IsLtSmallestNormal =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, Src, SmallestNormal);
auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
auto One = B.buildFConstant(F32, 1.0);
@@ -3315,8 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();
- if (Ty.isScalar(16)) {
- const LLT F32 = LLT::scalar(32);
+ if (Ty.isFloat(16)) {
// Nothing in half is a denormal when promoted to f32.
auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
@@ -3327,7 +3461,7 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
return true;
}
- assert(Ty.isScalar(32));
+ assert(Ty.isFloat(32));
auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
if (!ScaledInput) {
@@ -3370,9 +3504,6 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
const LLT Ty = MRI.getType(X);
MachineFunction &MF = B.getMF();
- const LLT F32 = LLT::scalar(32);
- const LLT F16 = LLT::scalar(16);
-
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
@@ -3448,7 +3579,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
auto Fabs = B.buildFAbs(Ty, Y);
auto IsFinite =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, Fabs, Inf, Flags);
R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
}
@@ -3474,7 +3605,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
LLT Ty = B.getMRI()->getType(Dst);
- if (Ty.isScalar(32)) {
+ if (Ty.isFloat(32)) {
auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
if (ScaledInput) {
auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
@@ -3497,7 +3628,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
}
}
- auto Log2Operand = Ty.isScalar(16)
+ auto Log2Operand = Ty.isFloat(16)
? B.buildFLog2(Ty, Src, Flags)
: B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
.addUse(Src)
@@ -3516,8 +3647,6 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
Register Src = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
- const LLT F16 = LLT::scalar(16);
- const LLT F32 = LLT::scalar(32);
if (Ty == F16) {
// Nothing in half is a denormal when promoted to f32.
@@ -3568,7 +3697,6 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
Register X, unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);
- LLT F32 = LLT::scalar(32);
if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
auto Log2E = B.buildFConstant(Ty, numbers::log2e);
@@ -3587,7 +3715,7 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
auto NeedsScaling =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::integer(1), X, Threshold, Flags);
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
@@ -3613,8 +3741,6 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
MachineFunction &MF = B.getMF();
MachineRegisterInfo &MRI = *B.getMRI();
LLT Ty = MRI.getType(Dst);
- const LLT F16 = LLT::scalar(16);
- const LLT F32 = LLT::scalar(32);
const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
if (Ty == F16) {
@@ -3715,7 +3841,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
// It is unsafe to contract this fsub into the PH multiply.
auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
- auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
+ auto IntE = B.buildFPTOSI(I32, E);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
.addUse(A.getReg(0))
@@ -3726,7 +3852,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
auto Underflow =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, X, UnderflowCheckConst);
R = B.buildSelect(Ty, Underflow, Zero, R);
@@ -3737,7 +3863,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
auto Overflow =
- B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
+ B.buildFCmp(CmpInst::FCMP_OGT, I1, X, OverflowCheckConst);
auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
}
@@ -3795,7 +3921,6 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S1 = LLT::scalar(1);
Register Dst = MI.getOperand(0).getReg();
Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
@@ -3836,7 +3961,7 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
Register CorrectedFract = Min;
if (!MI.getFlag(MachineInstr::FmNoNans)) {
- auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
+ auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, I1, ModSrc, ModSrc, Flags);
CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
}
@@ -3852,20 +3977,26 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeBuildVector(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
- const LLT S32 = LLT::scalar(32);
- const LLT S16 = LLT::scalar(16);
assert(MRI.getType(Dst).isFixedVector(2, 16));
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
+ LLT Src0Ty = MRI.getType(Src0);
+ LLT Src1Ty = MRI.getType(Src1);
+
if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
assert(MRI.getType(Src0).isScalar(32));
- Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
- Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
+ Src0 = B.buildTrunc(I16, Src0).getReg(0);
+ Src1 = B.buildTrunc(I16, Src1).getReg(0);
}
- auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
+ if (Src0Ty.isFloat() && Src1Ty.isFloat()) {
+ Src0 = B.buildBitcast(I16, Src0).getReg(0);
+ Src1 = B.buildBitcast(I16, Src1).getReg(0);
+ }
+
+ auto Merge = B.buildMergeLikeInstr(I32, {Src0, Src1});
B.buildBitcast(Dst, Merge);
MI.eraseFromParent();
@@ -3894,21 +4025,17 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
MachineIRBuilder &B = Helper.MIRBuilder;
GISelKnownBits &KB = *Helper.getKnownBits();
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
-
Register Zero32;
Register Zero64;
auto getZero32 = [&]() -> Register {
if (!Zero32)
- Zero32 = B.buildConstant(S32, 0).getReg(0);
+ Zero32 = B.buildConstant(I32, 0).getReg(0);
return Zero32;
};
auto getZero64 = [&]() -> Register {
if (!Zero64)
- Zero64 = B.buildConstant(S64, 0).getReg(0);
+ Zero64 = B.buildConstant(I64, 0).getReg(0);
return Zero64;
};
@@ -3931,16 +4058,16 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
Register CarryAccum;
if (CarryIn.size() == 1) {
if (!LocalAccum) {
- LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+ LocalAccum = B.buildZExt(I32, CarryIn[0]).getReg(0);
return Register();
}
CarryAccum = getZero32();
} else {
- CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+ CarryAccum = B.buildZExt(I32, CarryIn[0]).getReg(0);
for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
CarryAccum =
- B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
+ B.buildUAdde(I32, I1, CarryAccum, getZero32(), CarryIn[i])
.getReg(0);
}
@@ -3951,7 +4078,7 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
}
auto Add =
- B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
+ B.buildUAdde(I32, I1, CarryAccum, LocalAccum, CarryIn.back());
LocalAccum = Add.getReg(0);
return HaveCarryOut ? Add.getReg(1) : Register();
};
@@ -3986,15 +4113,15 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
++j0;
continue;
}
- auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
+ auto Mul = B.buildMul(I32, Src0[j0], Src1[j1]);
if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
LocalAccum[0] = Mul.getReg(0);
} else {
if (CarryIn.empty()) {
- LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
+ LocalAccum[0] = B.buildAdd(I32, LocalAccum[0], Mul).getReg(0);
} else {
LocalAccum[0] =
- B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
+ B.buildUAdde(I32, I1, LocalAccum[0], Mul, CarryIn.back())
.getReg(0);
CarryIn.pop_back();
}
@@ -4010,13 +4137,13 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
if (LocalAccum[0]) {
if (LocalAccum.size() == 1) {
- Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
+ Tmp = B.buildAnyExt(I64, LocalAccum[0]).getReg(0);
HaveSmallAccum = true;
} else if (LocalAccum[1]) {
- Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
+ Tmp = B.buildMergeLikeInstr(I64, LocalAccum).getReg(0);
HaveSmallAccum = false;
} else {
- Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
+ Tmp = B.buildZExt(I64, LocalAccum[0]).getReg(0);
HaveSmallAccum = true;
}
} else {
@@ -4031,7 +4158,7 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
++j0;
continue;
}
- auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
+ auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {I64, I1},
{Src0[j0], Src1[j1], Tmp});
Tmp = Mad.getReg(0);
if (!HaveSmallAccum)
@@ -4041,7 +4168,7 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
++j0;
} while (j0 <= DstIndex);
- auto Unmerge = B.buildUnmerge(S32, Tmp);
+ auto Unmerge = B.buildUnmerge(I32, Tmp);
LocalAccum[0] = Unmerge.getReg(0);
if (LocalAccum.size() > 1)
LocalAccum[1] = Unmerge.getReg(1);
@@ -4099,17 +4226,17 @@ void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
if (i == 1) {
if (!IsHighest)
- Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
+ Lo = B.buildUAddo(I32, I1, Accum[2 * i - 1], SeparateOddOut[0]);
else
- Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
+ Lo = B.buildAdd(I32, Accum[2 * i - 1], SeparateOddOut[0]);
} else {
- Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
+ Lo = B.buildUAdde(I32, I1, Accum[2 * i - 1], SeparateOddOut[0],
SeparateOddCarry);
}
Accum[2 * i - 1] = Lo->getOperand(0).getReg();
if (!IsHighest) {
- auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
+ auto Hi = B.buildUAdde(I32, I1, Accum[2 * i], SeparateOddOut[1],
Lo->getOperand(1).getReg());
Accum[2 * i] = Hi.getReg(0);
SeparateOddCarry = Hi.getReg(1);
@@ -4147,7 +4274,7 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
Register Src1 = MI.getOperand(2).getReg();
LLT Ty = MRI.getType(DstReg);
- assert(Ty.isScalar());
+ assert(Ty.isInteger());
unsigned Size = Ty.getSizeInBits();
unsigned NumParts = Size / 32;
@@ -4164,11 +4291,10 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
// in an even-aligned VGPR.
const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
- LLT S32 = LLT::scalar(32);
SmallVector<Register, 2> Src0Parts, Src1Parts;
for (unsigned i = 0; i < NumParts; ++i) {
- Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
- Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
+ Src0Parts.push_back(MRI.createGenericVirtualRegister(I32));
+ Src1Parts.push_back(MRI.createGenericVirtualRegister(I32));
}
B.buildUnmerge(Src0Parts, Src0);
B.buildUnmerge(Src1Parts, Src1);
@@ -4213,10 +4339,10 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
assert(NumBits < 32u);
- auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
- auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
- auto Shift = B.buildShl(S32, Extend, ShiftAmt);
- auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
+ auto ShiftAmt = B.buildConstant(I32, 32u - NumBits);
+ auto Extend = B.buildAnyExt(I32, {Src}).getReg(0u);
+ auto Shift = B.buildShl(I32, Extend, ShiftAmt);
+ auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {I32}, {Shift});
B.buildTrunc(Dst, Ctlz);
MI.eraseFromParent();
return true;
@@ -4285,7 +4411,6 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
*ArgRC, B.getDebugLoc(), ArgTy);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
- const LLT S32 = LLT::scalar(32);
const unsigned Mask = Arg->getMask();
const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
@@ -4294,11 +4419,11 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
// TODO: Avoid clearing the high bits if we know workitem id y/z are always
// 0.
if (Shift != 0) {
- auto ShiftAmt = B.buildConstant(S32, Shift);
- AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
+ auto ShiftAmt = B.buildConstant(I32, Shift);
+ AndMaskSrc = B.buildLShr(I32, LiveIn, ShiftAmt).getReg(0);
}
- B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
+ B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(I32, Mask >> Shift));
} else {
B.buildCopy(DstReg, LiveIn);
}
@@ -4331,17 +4456,17 @@ bool AMDGPULegalizerInfo::loadInputValue(
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Arg = &WorkGroupIDX;
ArgRC = &AMDGPU::SReg_32RegClass;
- ArgTy = LLT::scalar(32);
+ ArgTy = I32;
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
Arg = &WorkGroupIDY;
ArgRC = &AMDGPU::SReg_32RegClass;
- ArgTy = LLT::scalar(32);
+ ArgTy = I32;
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
Arg = &WorkGroupIDZ;
ArgRC = &AMDGPU::SReg_32RegClass;
- ArgTy = LLT::scalar(32);
+ ArgTy = I32;
break;
default:
break;
@@ -4438,7 +4563,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
llvm_unreachable("failed to find kernarg segment ptr");
- auto COffset = B.buildConstant(LLT::scalar(64), Offset);
+ auto COffset = B.buildConstant(LLT::integer(64), Offset);
// TODO: Should get nuw
return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
}
@@ -4469,11 +4594,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
- if (DstTy.isScalar(16))
+ if (DstTy.isFloat(16))
return legalizeFDIV16(MI, MRI, B);
- if (DstTy.isScalar(32))
+ if (DstTy.isFloat(32))
return legalizeFDIV32(MI, MRI, B);
- if (DstTy.isScalar(64))
+ if (DstTy.isFloat(64))
return legalizeFDIV64(MI, MRI, B);
return false;
@@ -4484,42 +4609,39 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
Register DstRemReg,
Register X,
Register Y) const {
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
-
// See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
// algorithm used here.
// Initial estimate of inv(y).
- auto FloatY = B.buildUITOFP(S32, Y);
- auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
- auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
- auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
- auto Z = B.buildFPTOUI(S32, ScaledY);
+ auto FloatY = B.buildUITOFP(F32, Y);
+ auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {F32}, {FloatY});
+ auto Scale = B.buildFConstant(F32, llvm::bit_cast<float>(0x4f7ffffe));
+ auto ScaledY = B.buildFMul(F32, RcpIFlag, Scale);
+ auto Z = B.buildFPTOUI(I32, ScaledY);
// One round of UNR.
- auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
- auto NegYZ = B.buildMul(S32, NegY, Z);
- Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
+ auto NegY = B.buildSub(I32, B.buildConstant(I32, 0), Y);
+ auto NegYZ = B.buildMul(I32, NegY, Z);
+ Z = B.buildAdd(I32, Z, B.buildUMulH(I32, Z, NegYZ));
// Quotient/remainder estimate.
- auto Q = B.buildUMulH(S32, X, Z);
- auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
+ auto Q = B.buildUMulH(I32, X, Z);
+ auto R = B.buildSub(I32, X, B.buildMul(I32, Q, Y));
// First quotient/remainder refinement.
- auto One = B.buildConstant(S32, 1);
- auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ auto One = B.buildConstant(I32, 1);
+ auto Cond = B.buildICmp(CmpInst::ICMP_UGE, I1, R, Y);
if (DstDivReg)
- Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
- R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
+ Q = B.buildSelect(I32, Cond, B.buildAdd(I32, Q, One), Q);
+ R = B.buildSelect(I32, Cond, B.buildSub(I32, R, Y), R);
// Second quotient/remainder refinement.
- Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ Cond = B.buildICmp(CmpInst::ICMP_UGE, I1, R, Y);
if (DstDivReg)
- B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
+ B.buildSelect(DstDivReg, Cond, B.buildAdd(I32, Q, One), Q);
if (DstRemReg)
- B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
+ B.buildSelect(DstRemReg, Cond, B.buildSub(I32, R, Y), R);
}
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
@@ -4537,32 +4659,31 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
Register Val) {
- const LLT S32 = LLT::scalar(32);
- auto Unmerge = B.buildUnmerge(S32, Val);
+ auto Unmerge = B.buildUnmerge(I32, Val);
- auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
- auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
+ auto CvtLo = B.buildUITOFP(F32, Unmerge.getReg(0));
+ auto CvtHi = B.buildUITOFP(F32, Unmerge.getReg(1));
auto Mad = B.buildFMAD(
- S32, CvtHi, // 2**32
- B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
+ F32, CvtHi, // 2**32
+ B.buildFConstant(F32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
- auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
+ auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {F32}, {Mad});
auto Mul1 = B.buildFMul(
- S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
+ F32, Rcp, B.buildFConstant(F32, llvm::bit_cast<float>(0x5f7ffffc)));
// 2**(-32)
auto Mul2 = B.buildFMul(
- S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
- auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
+ F32, Mul1, B.buildFConstant(F32, llvm::bit_cast<float>(0x2f800000)));
+ auto Trunc = B.buildIntrinsicTrunc(F32, Mul2);
// -(2**32)
auto Mad2 = B.buildFMAD(
- S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
+ F32, Trunc, B.buildFConstant(F32, llvm::bit_cast<float>(0xcf800000)),
Mul1);
- auto ResultLo = B.buildFPTOUI(S32, Mad2);
- auto ResultHi = B.buildFPTOUI(S32, Trunc);
+ auto ResultLo = B.buildFPTOUI(I32, Mad2);
+ auto ResultHi = B.buildFPTOUI(I32, Trunc);
return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
@@ -4572,109 +4693,106 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
Register DstRemReg,
Register Numer,
Register Denom) const {
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
- const LLT S1 = LLT::scalar(1);
Register RcpLo, RcpHi;
std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
- auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
+ auto Rcp = B.buildMergeLikeInstr(I64, {RcpLo, RcpHi});
- auto Zero64 = B.buildConstant(S64, 0);
- auto NegDenom = B.buildSub(S64, Zero64, Denom);
+ auto Zero64 = B.buildConstant(I64, 0);
+ auto NegDenom = B.buildSub(I64, Zero64, Denom);
- auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
- auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
+ auto MulLo1 = B.buildMul(I64, NegDenom, Rcp);
+ auto MulHi1 = B.buildUMulH(I64, Rcp, MulLo1);
- auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
+ auto UnmergeMulHi1 = B.buildUnmerge(I32, MulHi1);
Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
- auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
- auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
- auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
+ auto Add1_Lo = B.buildUAddo(I32, I1, RcpLo, MulHi1_Lo);
+ auto Add1_Hi = B.buildUAdde(I32, I1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
+ auto Add1 = B.buildMergeLikeInstr(I64, {Add1_Lo, Add1_Hi});
- auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
- auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
- auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
+ auto MulLo2 = B.buildMul(I64, NegDenom, Add1);
+ auto MulHi2 = B.buildUMulH(I64, Add1, MulLo2);
+ auto UnmergeMulHi2 = B.buildUnmerge(I32, MulHi2);
Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
- auto Zero32 = B.buildConstant(S32, 0);
- auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
- auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
- auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
+ auto Zero32 = B.buildConstant(I32, 0);
+ auto Add2_Lo = B.buildUAddo(I32, I1, Add1_Lo, MulHi2_Lo);
+ auto Add2_Hi = B.buildUAdde(I32, I1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
+ auto Add2 = B.buildMergeLikeInstr(I64, {Add2_Lo, Add2_Hi});
- auto UnmergeNumer = B.buildUnmerge(S32, Numer);
+ auto UnmergeNumer = B.buildUnmerge(I32, Numer);
Register NumerLo = UnmergeNumer.getReg(0);
Register NumerHi = UnmergeNumer.getReg(1);
- auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
- auto Mul3 = B.buildMul(S64, Denom, MulHi3);
- auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
+ auto MulHi3 = B.buildUMulH(I64, Numer, Add2);
+ auto Mul3 = B.buildMul(I64, Denom, MulHi3);
+ auto UnmergeMul3 = B.buildUnmerge(I32, Mul3);
Register Mul3_Lo = UnmergeMul3.getReg(0);
Register Mul3_Hi = UnmergeMul3.getReg(1);
- auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
- auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
- auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
- auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
+ auto Sub1_Lo = B.buildUSubo(I32, I1, NumerLo, Mul3_Lo);
+ auto Sub1_Hi = B.buildUSube(I32, I1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
+ auto Sub1_Mi = B.buildSub(I32, NumerHi, Mul3_Hi);
+ auto Sub1 = B.buildMergeLikeInstr(I64, {Sub1_Lo, Sub1_Hi});
- auto UnmergeDenom = B.buildUnmerge(S32, Denom);
+ auto UnmergeDenom = B.buildUnmerge(I32, Denom);
Register DenomLo = UnmergeDenom.getReg(0);
Register DenomHi = UnmergeDenom.getReg(1);
- auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
- auto C1 = B.buildSExt(S32, CmpHi);
+ auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, I1, Sub1_Hi, DenomHi);
+ auto C1 = B.buildSExt(I32, CmpHi);
- auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
- auto C2 = B.buildSExt(S32, CmpLo);
+ auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, I1, Sub1_Lo, DenomLo);
+ auto C2 = B.buildSExt(I32, CmpLo);
- auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
- auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
+ auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, I1, Sub1_Hi, DenomHi);
+ auto C3 = B.buildSelect(I32, CmpEq, C2, C1);
// TODO: Here and below portions of the code can be enclosed into if/endif.
// Currently control flow is unconditional and we have 4 selects after
// potential endif to substitute PHIs.
// if C3 != 0 ...
- auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
- auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
- auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
- auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
+ auto Sub2_Lo = B.buildUSubo(I32, I1, Sub1_Lo, DenomLo);
+ auto Sub2_Mi = B.buildUSube(I32, I1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
+ auto Sub2_Hi = B.buildUSube(I32, I1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
+ auto Sub2 = B.buildMergeLikeInstr(I64, {Sub2_Lo, Sub2_Hi});
- auto One64 = B.buildConstant(S64, 1);
- auto Add3 = B.buildAdd(S64, MulHi3, One64);
+ auto One64 = B.buildConstant(I64, 1);
+ auto Add3 = B.buildAdd(I64, MulHi3, One64);
auto C4 =
- B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
+ B.buildSExt(I32, B.buildICmp(CmpInst::ICMP_UGE, I1, Sub2_Hi, DenomHi));
auto C5 =
- B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
+ B.buildSExt(I32, B.buildICmp(CmpInst::ICMP_UGE, I1, Sub2_Lo, DenomLo));
auto C6 = B.buildSelect(
- S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
+ I32, B.buildICmp(CmpInst::ICMP_EQ, I1, Sub2_Hi, DenomHi), C5, C4);
// if (C6 != 0)
- auto Add4 = B.buildAdd(S64, Add3, One64);
- auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
+ auto Add4 = B.buildAdd(I64, Add3, One64);
+ auto Sub3_Lo = B.buildUSubo(I32, I1, Sub2_Lo, DenomLo);
- auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
- auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
- auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
+ auto Sub3_Mi = B.buildUSube(I32, I1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
+ auto Sub3_Hi = B.buildUSube(I32, I1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
+ auto Sub3 = B.buildMergeLikeInstr(I64, {Sub3_Lo, Sub3_Hi});
// endif C6
// endif C3
if (DstDivReg) {
auto Sel1 = B.buildSelect(
- S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
- B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ I64, B.buildICmp(CmpInst::ICMP_NE, I1, C6, Zero32), Add4, Add3);
+ B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, I1, C3, Zero32),
Sel1, MulHi3);
}
if (DstRemReg) {
auto Sel2 = B.buildSelect(
- S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
- B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ I64, B.buildICmp(CmpInst::ICMP_NE, I1, C6, Zero32), Sub3, Sub2);
+ B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, I1, C3, Zero32),
Sel2, Sub1);
}
}
@@ -4706,9 +4824,9 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty.isScalar(32))
+ if (Ty.isInteger(32))
legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
- else if (Ty.isScalar(64))
+ else if (Ty.isInteger(64))
legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
else
return false;
@@ -4720,17 +4838,15 @@ bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT S32 = LLT::scalar(32);
-
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (!Ty.isScalar(32) && !Ty.isScalar(64))
+ if (!Ty.isInteger(32) && !Ty.isInteger(64))
return false;
const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
- auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
+ auto SignBitOffset = B.buildConstant(I32, Ty.getSizeInBits() - 1);
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
@@ -4763,7 +4879,7 @@ bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
}
}
- if (Ty.isScalar(32))
+ if (Ty.isInteger(32))
legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
else
legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
@@ -4895,9 +5011,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
- LLT S16 = LLT::scalar(16);
- LLT S32 = LLT::scalar(32);
-
// a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
// b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
// r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
@@ -4911,27 +5024,28 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
// q16.u = opx(V_CVT_F16_F32, q32.u);
// q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
- auto LHSExt = B.buildFPExt(S32, LHS, Flags);
- auto RHSExt = B.buildFPExt(S32, RHS, Flags);
- auto NegRHSExt = B.buildFNeg(S32, RHSExt);
- auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto LHSExt = B.buildFPExt(F32, LHS, Flags);
+ auto RHSExt = B.buildFPExt(F32, RHS, Flags);
+ auto NegRHSExt = B.buildFNeg(F32, RHSExt);
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F32})
.addUse(RHSExt.getReg(0))
.setMIFlags(Flags);
- auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
+ auto Quot = B.buildFMul(F32, LHSExt, Rcp, Flags);
MachineInstrBuilder Err;
if (ST.hasMadMacF32Insts()) {
- Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
- Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
- Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+ Err = B.buildFMAD(F32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMAD(F32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMAD(F32, NegRHSExt, Quot, LHSExt, Flags);
} else {
- Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
- Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
- Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
- }
- auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
- Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
- Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
- auto RDst = B.buildFPTrunc(S16, Quot, Flags);
+ Err = B.buildFMA(F32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMA(F32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMA(F32, NegRHSExt, Quot, LHSExt, Flags);
+ }
+ auto Tmp = B.buildFMul(F32, Err, Rcp, Flags);
+ Tmp = B.buildAnd(I32, Tmp, B.buildConstant(I32, 0xff800000));
+ Tmp = B.buildBitcast(F32, Tmp);
+ Quot = B.buildFAdd(F32, Tmp, Quot, Flags);
+ auto RDst = B.buildFPTrunc(F16, Quot, Flags);
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
.addUse(RDst.getReg(0))
.addUse(RHS)
@@ -4983,28 +5097,25 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
- LLT S32 = LLT::scalar(32);
- LLT S1 = LLT::scalar(1);
-
- auto One = B.buildFConstant(S32, 1.0f);
+ auto One = B.buildFConstant(F32, 1.0f);
auto DenominatorScaled =
- B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
+ B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F32, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(0)
.setMIFlags(Flags);
auto NumeratorScaled =
- B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
+ B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F32, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(1)
.setMIFlags(Flags);
- auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F32})
.addUse(DenominatorScaled.getReg(0))
.setMIFlags(Flags);
- auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
+ auto NegDivScale0 = B.buildFNeg(F32, DenominatorScaled, Flags);
const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
const bool HasDynamicDenormals =
@@ -5022,12 +5133,12 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
toggleSPDenormMode(true, B, ST, Mode);
}
- auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
- auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
- auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
- auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
- auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
- auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
+ auto Fma0 = B.buildFMA(F32, NegDivScale0, ApproxRcp, One, Flags);
+ auto Fma1 = B.buildFMA(F32, Fma0, ApproxRcp, ApproxRcp, Flags);
+ auto Mul = B.buildFMul(F32, NumeratorScaled, Fma1, Flags);
+ auto Fma2 = B.buildFMA(F32, NegDivScale0, Mul, NumeratorScaled, Flags);
+ auto Fma3 = B.buildFMA(F32, Fma2, Fma1, Mul, Flags);
+ auto Fma4 = B.buildFMA(F32, NegDivScale0, Fma3, NumeratorScaled, Flags);
if (!PreservesDenormals) {
if (HasDynamicDenormals) {
@@ -5039,7 +5150,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
toggleSPDenormMode(false, B, ST, Mode);
}
- auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
+ auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {F32})
.addUse(Fma4.getReg(0))
.addUse(Fma1.getReg(0))
.addUse(Fma3.getReg(0))
@@ -5068,59 +5179,54 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
- LLT S64 = LLT::scalar(64);
- LLT S1 = LLT::scalar(1);
-
- auto One = B.buildFConstant(S64, 1.0);
+ auto One = B.buildFConstant(F64, 1.0);
- auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
+ auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F64, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(0)
.setMIFlags(Flags);
- auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
+ auto NegDivScale0 = B.buildFNeg(F64, DivScale0.getReg(0), Flags);
- auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F64})
.addUse(DivScale0.getReg(0))
.setMIFlags(Flags);
- auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
- auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
- auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
+ auto Fma0 = B.buildFMA(F64, NegDivScale0, Rcp, One, Flags);
+ auto Fma1 = B.buildFMA(F64, Rcp, Fma0, Rcp, Flags);
+ auto Fma2 = B.buildFMA(F64, NegDivScale0, Fma1, One, Flags);
- auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
+ auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {F64, I1})
.addUse(LHS)
.addUse(RHS)
.addImm(1)
.setMIFlags(Flags);
- auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
- auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
- auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
+ auto Fma3 = B.buildFMA(F64, Fma1, Fma2, Fma1, Flags);
+ auto Mul = B.buildFMul(F64, DivScale1.getReg(0), Fma3, Flags);
+ auto Fma4 = B.buildFMA(F64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
Register Scale;
if (!ST.hasUsableDivScaleConditionOutput()) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
- LLT S32 = LLT::scalar(32);
+ auto NumUnmerge = B.buildUnmerge(I32, LHS);
+ auto DenUnmerge = B.buildUnmerge(I32, RHS);
+ auto Scale0Unmerge = B.buildUnmerge(I32, DivScale0);
+ auto Scale1Unmerge = B.buildUnmerge(I32, DivScale1);
- auto NumUnmerge = B.buildUnmerge(S32, LHS);
- auto DenUnmerge = B.buildUnmerge(S32, RHS);
- auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
- auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
-
- auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
+ auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, I1, NumUnmerge.getReg(1),
Scale1Unmerge.getReg(1));
- auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
+ auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, I1, DenUnmerge.getReg(1),
Scale0Unmerge.getReg(1));
- Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
+ Scale = B.buildXor(I1, CmpNum, CmpDen).getReg(0);
} else {
Scale = DivScale1.getReg(1);
}
- auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
+ auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {F64})
.addUse(Fma4.getReg(0))
.addUse(Fma3.getReg(0))
.addUse(Mul.getReg(0))
@@ -5146,7 +5252,7 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT Ty = MRI.getType(Res0);
- LLT InstrExpTy = Ty.isScalar(16) ? LLT::scalar(16) : LLT::scalar(32);
+ LLT InstrExpTy = Ty.isFloat(16) ? I16 : I32;
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
.addUse(Val)
@@ -5159,7 +5265,7 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
auto Fabs = B.buildFAbs(Ty, Val);
auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
auto IsFinite =
- B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
+ B.buildFCmp(CmpInst::FCMP_OLT, I1, Fabs, Inf, Flags);
auto Zero = B.buildConstant(InstrExpTy, 0);
Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
@@ -5180,26 +5286,23 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
Register RHS = MI.getOperand(3).getReg();
uint16_t Flags = MI.getFlags();
- LLT S32 = LLT::scalar(32);
- LLT S1 = LLT::scalar(1);
-
- auto Abs = B.buildFAbs(S32, RHS, Flags);
+ auto Abs = B.buildFAbs(F32, RHS, Flags);
const APFloat C0Val(1.0f);
- auto C0 = B.buildFConstant(S32, 0x1p+96f);
- auto C1 = B.buildFConstant(S32, 0x1p-32f);
- auto C2 = B.buildFConstant(S32, 1.0f);
+ auto C0 = B.buildFConstant(F32, 0x1p+96f);
+ auto C1 = B.buildFConstant(F32, 0x1p-32f);
+ auto C2 = B.buildFConstant(F32, 1.0f);
- auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
- auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
+ auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, I1, Abs, C0, Flags);
+ auto Sel = B.buildSelect(F32, CmpRes, C1, C2, Flags);
- auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
+ auto Mul0 = B.buildFMul(F32, RHS, Sel, Flags);
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {F32})
.addUse(Mul0.getReg(0))
.setMIFlags(Flags);
- auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
+ auto Mul1 = B.buildFMul(F32, LHS, RCP, Flags);
B.buildFMul(Res, Sel, Mul1, Flags);
@@ -5214,7 +5317,6 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
// get. The f32 op is accurate enough for the f16 cas.
unsigned Flags = MI.getFlags();
assert(!ST.has16BitInsts());
- const LLT F32 = LLT::scalar(32);
auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
.addUse(Ext.getReg(0))
@@ -5231,9 +5333,6 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
Register Dst = MI.getOperand(0).getReg();
Register X = MI.getOperand(1).getReg();
const unsigned Flags = MI.getFlags();
- const LLT S1 = LLT::scalar(1);
- const LLT F32 = LLT::scalar(32);
- const LLT I32 = LLT::scalar(32);
if (allowApproxFunc(MF, Flags)) {
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
@@ -5244,7 +5343,7 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
}
auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
- auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
+ auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, I1, ScaleThreshold, X, Flags);
auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
@@ -5254,26 +5353,27 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
.addUse(SqrtX.getReg(0))
.setMIFlags(Flags);
+ auto SqrtSInt = B.buildBitcast(I32, SqrtS);
auto NegOne = B.buildConstant(I32, -1);
- auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
+ auto SqrtSNextDown = B.buildBitcast(F32, B.buildAdd(I32, SqrtSInt, NegOne));
auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
auto PosOne = B.buildConstant(I32, 1);
- auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
+ auto SqrtSNextUp = B.buildBitcast(F32, B.buildAdd(I32, SqrtSInt, PosOne));
auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
auto Zero = B.buildFConstant(F32, 0.0f);
- auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
+ auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, I1, SqrtVP, Zero, Flags);
SqrtS =
B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
- auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
+ auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, I1, SqrtVS, Zero, Flags);
SqrtS =
B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
} else {
@@ -5298,7 +5398,7 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
- auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+ auto IsZeroOrInf = B.buildIsFPClass(I1, SqrtX, fcZero | fcPosInf);
B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
MI.eraseFromParent();
@@ -5328,10 +5428,6 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
//
// sqrt(x) = g3
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
- const LLT F64 = LLT::scalar(64);
-
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
@@ -5340,12 +5436,12 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
- auto ZeroInt = B.buildConstant(S32, 0);
- auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
+ auto ZeroInt = B.buildConstant(I32, 0);
+ auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, I1, X, ScaleConstant);
// Scale up input if it is too small.
- auto ScaleUpFactor = B.buildConstant(S32, 256);
- auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
+ auto ScaleUpFactor = B.buildConstant(I32, 256);
+ auto ScaleUp = B.buildSelect(I32, Scaling, ScaleUpFactor, ZeroInt);
auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
auto SqrtY =
@@ -5372,15 +5468,15 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
// Scale down the result.
- auto ScaleDownFactor = B.buildConstant(S32, -128);
- auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
+ auto ScaleDownFactor = B.buildConstant(I32, -128);
+ auto ScaleDown = B.buildSelect(I32, Scaling, ScaleDownFactor, ZeroInt);
SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
// TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
// with finite only or nsz because rsq(+/-0) = +/-inf
// TODO: Check for DAZ and expand to subnormals
- auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+ auto IsZeroOrInf = B.buildIsFPClass(I1, SqrtX, fcZero | fcPosInf);
// If x is +INF, +0, or -0, use its original value
B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
@@ -5393,11 +5489,11 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty.isScalar(32))
+ if (Ty.isFloat(32))
return legalizeFSQRTF32(MI, MRI, B);
- if (Ty.isScalar(64))
+ if (Ty.isFloat(64))
return legalizeFSQRTF64(MI, MRI, B);
- if (Ty.isScalar(16))
+ if (Ty.isFloat(16))
return legalizeFSQRTF16(MI, MRI, B);
return false;
}
@@ -5420,13 +5516,10 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
LLT Ty = MRI.getType(Dst);
- const fltSemantics *FltSemantics;
- if (Ty.isScalar(32))
- FltSemantics = &APFloat::IEEEsingle();
- else if (Ty.isScalar(64))
- FltSemantics = &APFloat::IEEEdouble();
- else
- return false;
+ if (!Ty.isFloat())
+ return false;
+
+ const llvm::fltSemantics &FltSemantics = getFltSemanticForLLT(Ty);
auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
.addUse(Src)
@@ -5437,11 +5530,11 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
const bool UseIEEE = MFI->getMode().IEEE;
- auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
+ auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(FltSemantics));
auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
- auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
+ auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(FltSemantics, true));
if (UseIEEE)
B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
@@ -5529,15 +5622,15 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
}
if (Size < 32) {
- Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+ Src0 = B.buildAnyExt(I32, Src0).getReg(0);
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
- Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
+ Src1 = B.buildAnyExt(I32, Src1).getReg(0);
if (IID == Intrinsic::amdgcn_writelane)
- Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+ Src2 = B.buildAnyExt(I32, Src2).getReg(0);
- Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+ Register LaneOpDst = createLaneOp(Src0, Src1, Src2, I32);
B.buildTrunc(DstReg, LaneOpDst);
MI.eraseFromParent();
return true;
@@ -5546,7 +5639,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size % SplitSize != 0)
return false;
- LLT PartialResTy = LLT::scalar(SplitSize);
+ LLT PartialResTy = LLT::integer(SplitSize);
if (Ty.isVector()) {
LLT EltTy = Ty.getElementType();
unsigned EltSize = EltTy.getSizeInBits();
@@ -5594,7 +5687,7 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
LLT DstTy = MRI.getType(DstReg);
- LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
+ LLT IdxTy = LLT::integer(DstTy.getSizeInBits());
Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
if (!loadInputValue(KernargPtrReg, B,
@@ -5618,15 +5711,13 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
Register NumRecords = MI.getOperand(4).getReg();
Register Flags = MI.getOperand(5).getReg();
- LLT S32 = LLT::scalar(32);
-
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- auto Unmerge = B.buildUnmerge(S32, Pointer);
+ auto Unmerge = B.buildUnmerge(I32, Pointer);
Register LowHalf = Unmerge.getReg(0);
Register HighHalf = Unmerge.getReg(1);
- auto AndMask = B.buildConstant(S32, 0x0000ffff);
- auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+ auto AndMask = B.buildConstant(I32, 0x0000ffff);
+ auto Masked = B.buildAnd(I32, HighHalf, AndMask);
MachineInstrBuilder NewHighHalf = Masked;
std::optional<ValueAndVReg> StrideConst =
@@ -5636,13 +5727,13 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
if (StrideConst) {
uint32_t StrideVal = StrideConst->Value.getZExtValue();
uint32_t ShiftedStrideVal = StrideVal << 16;
- ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ ShiftedStride = B.buildConstant(I32, ShiftedStrideVal);
} else {
- auto ExtStride = B.buildAnyExt(S32, Stride);
- auto ShiftConst = B.buildConstant(S32, 16);
- ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ auto ExtStride = B.buildAnyExt(I32, Stride);
+ auto ShiftConst = B.buildConstant(I32, 16);
+ ShiftedStride = B.buildShl(I32, ExtStride, ShiftConst);
}
- NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ NewHighHalf = B.buildOr(I32, Masked, ShiftedStride);
}
Register NewHighHalfReg = NewHighHalf.getReg(0);
B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
@@ -5701,7 +5792,7 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ auto Unmerge = B.buildUnmerge(I32, MI.getOperand(2).getReg());
Register Hi32 = Unmerge.getReg(1);
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
@@ -5721,7 +5812,6 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
Register BaseReg;
unsigned ImmOffset;
- const LLT S32 = LLT::scalar(32);
MachineRegisterInfo &MRI = *B.getMRI();
std::tie(BaseReg, ImmOffset) =
@@ -5748,15 +5838,15 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
if (Overflow != 0) {
if (!BaseReg) {
- BaseReg = B.buildConstant(S32, Overflow).getReg(0);
+ BaseReg = B.buildConstant(I32, Overflow).getReg(0);
} else {
- auto OverflowVal = B.buildConstant(S32, Overflow);
- BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
+ auto OverflowVal = B.buildConstant(I32, Overflow);
+ BaseReg = B.buildAdd(I32, BaseReg, OverflowVal).getReg(0);
}
}
if (!BaseReg)
- BaseReg = B.buildConstant(S32, 0).getReg(0);
+ BaseReg = B.buildConstant(I32, 0).getReg(0);
return std::pair(BaseReg, ImmOffset);
}
@@ -5766,52 +5856,50 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
Register Reg,
bool ImageStore) const {
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
LLT StoreVT = MRI.getType(Reg);
- assert(StoreVT.isVector() && StoreVT.getElementType().isScalar(16));
+ assert(StoreVT.isVector() && StoreVT.getElementType().isFloat(16));
if (ST.hasUnpackedD16VMem()) {
- auto Unmerge = B.buildUnmerge(S16, Reg);
+ auto Unmerge = B.buildUnmerge(I16, Reg);
SmallVector<Register, 4> WideRegs;
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
- WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+ WideRegs.push_back(B.buildAnyExt(I32, Unmerge.getReg(I)).getReg(0));
int NumElts = StoreVT.getNumElements();
- return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
+ return B.buildBuildVector(LLT::fixed_vector(NumElts, I32), WideRegs)
.getReg(0);
}
if (ImageStore && ST.hasImageStoreD16Bug()) {
if (StoreVT.getNumElements() == 2) {
SmallVector<Register, 4> PackedRegs;
- Reg = B.buildBitcast(S32, Reg).getReg(0);
+ Reg = B.buildBitcast(I32, Reg).getReg(0);
PackedRegs.push_back(Reg);
- PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
+ PackedRegs.resize(2, B.buildUndef(I32).getReg(0));
+ return B.buildBuildVector(V2I32, PackedRegs)
.getReg(0);
}
if (StoreVT.getNumElements() == 3) {
SmallVector<Register, 4> PackedRegs;
- auto Unmerge = B.buildUnmerge(S16, Reg);
+ auto Unmerge = B.buildUnmerge(I16, Reg);
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
- PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
- Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
- return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
+ PackedRegs.resize(6, B.buildUndef(I16).getReg(0));
+ Reg = B.buildBuildVector(V6I16, PackedRegs).getReg(0);
+ return B.buildBitcast(V3I32, Reg).getReg(0);
}
if (StoreVT.getNumElements() == 4) {
SmallVector<Register, 4> PackedRegs;
- Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
- auto Unmerge = B.buildUnmerge(S32, Reg);
+ Reg = B.buildBitcast(V2I32, Reg).getReg(0);
+ auto Unmerge = B.buildUnmerge(I32, Reg);
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
- PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
+ PackedRegs.resize(4, B.buildUndef(I32).getReg(0));
+ return B.buildBuildVector(V4I32, PackedRegs)
.getReg(0);
}
@@ -5819,8 +5907,7 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
}
if (StoreVT.isFixedVector(3, 16)) {
- Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
- .getReg(0);
+ Reg = B.buildPadVectorWithUndefElements(V4I16, Reg).getReg(0);
}
return Reg;
}
@@ -5841,7 +5928,9 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
}
// Fixup illegal register types for i8 stores.
if (Ty.isScalar(8) || Ty.isScalar(16)) {
- Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
+ if (Ty.isFloat())
+ VData = B.buildBitcast(I16, VData).getReg(0);
+ Register AnyExt = B.buildAnyExt(I32, VData).getReg(0);
return AnyExt;
}
@@ -5866,7 +5955,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
LLT Ty = MRI.getType(VData);
LLT EltTy = Ty.getScalarType();
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
- const LLT S32 = LLT::scalar(32);
MachineMemOperand *MMO = *MI.memoperands_begin();
const int MemSize = MMO->getSize().getValue();
@@ -5890,7 +5978,7 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
VIndex = MI.getOperand(3).getReg();
OpOffset = 1;
} else {
- VIndex = B.buildConstant(S32, 0).getReg(0);
+ VIndex = B.buildConstant(I32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -5978,7 +6066,6 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
MachineMemOperand *MMO = *MI.memoperands_begin();
const LLT MemTy = MMO->getMemoryType();
- const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
@@ -6004,7 +6091,7 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
VIndex = MI.getOperand(3 + OpOffset).getReg();
++OpOffset;
} else {
- VIndex = B.buildConstant(S32, 0).getReg(0);
+ VIndex = B.buildConstant(I32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -6081,20 +6168,24 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
if (IsTFE) {
unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
unsigned NumLoadDWords = NumValueDWords + 1;
- LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
+ LLT LoadTy = LLT::fixed_vector(NumLoadDWords, I32);
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
if (MemTy.getSizeInBits() < 32) {
- Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
+ Register ExtDst = B.getMRI()->createGenericVirtualRegister(I32);
B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
- B.buildTrunc(Dst, ExtDst);
+ if (Ty.isFloat()) {
+ B.buildBitcast(Dst, B.buildTrunc(I16, ExtDst));
+ } else {
+ B.buildTrunc(Dst, ExtDst);
+ }
} else if (NumValueDWords == 1) {
B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
} else {
SmallVector<Register, 5> LoadElts;
for (unsigned I = 0; I != NumValueDWords; ++I)
- LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
+ LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(I32));
LoadElts.push_back(StatusDst);
B.buildUnmerge(LoadElts, LoadDstReg);
LoadElts.truncate(NumValueDWords);
@@ -6102,23 +6193,33 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
}
} else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
(IsD16 && !Ty.isVector())) {
- Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
+ Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(I32);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- B.buildTrunc(Dst, LoadDstReg);
+ if (Ty.isFloat()) {
+ B.buildBitcast(Dst, B.buildTrunc(I16, LoadDstReg));
+ } else {
+ B.buildTrunc(Dst, LoadDstReg);
+ }
} else if (Unpacked && IsD16 && Ty.isVector()) {
- LLT UnpackedTy = Ty.changeElementSize(32);
+ LLT UnpackedTy = Ty.changeElementType(I32);
Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
// FIXME: G_TRUNC should work, but legalization currently fails
- auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
+ auto Unmerge = B.buildUnmerge(I32, LoadDstReg);
SmallVector<Register, 4> Repack;
for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
- B.buildMergeLikeInstr(Dst, Repack);
+
+ if (Ty.isFloatVector()) {
+ B.buildBitcast(Dst, B.buildMergeLikeInstr(Ty.changeElementType(I16), Repack));
+ } else {
+ B.buildMergeLikeInstr(Dst, Repack);
+ }
+
} else {
buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
AuxiliaryData, MMO, IsTyped, HasVIndex, B);
@@ -6251,7 +6352,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
VIndex = MI.getOperand(4 + OpOffset).getReg();
++OpOffset;
} else {
- VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+ VIndex = B.buildConstant(I32, 0).getReg(0);
}
Register VOffset = MI.getOperand(4 + OpOffset).getReg();
@@ -6290,8 +6391,6 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
unsigned ArgOffset,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
bool IsA16, bool IsG16) {
- const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
auto EndIdx = Intr->VAddrEnd;
for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
@@ -6310,13 +6409,13 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
PackedAddrs.push_back(
- B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+ B.buildBuildVector(V2I16, {AddrReg, B.buildUndef(I16).getReg(0)})
.getReg(0));
} else {
assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
"Bias needs to be converted to 16 bit in A16 mode");
// Handle any gradient or coordinate operands that should not be packed
- AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
+ AddrReg = B.buildBitcast(V2I16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
}
} else {
@@ -6331,12 +6430,12 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
// Check for _L to _LZ optimization
!MI.getOperand(ArgOffset + I + 1).isReg()) {
PackedAddrs.push_back(
- B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+ B.buildBuildVector(V2I16, {AddrReg, B.buildUndef(I16).getReg(0)})
.getReg(0));
} else {
PackedAddrs.push_back(
B.buildBuildVector(
- V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
+ V2I16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
.getReg(0));
++I;
}
@@ -6348,8 +6447,6 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
int DimIdx, int NumVAddrs) {
- const LLT S32 = LLT::scalar(32);
- (void)S32;
SmallVector<Register, 8> AddrRegs;
for (int I = 0; I != NumVAddrs; ++I) {
MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
@@ -6362,7 +6459,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
int NumAddrRegs = AddrRegs.size();
if (NumAddrRegs != 1) {
auto VAddr =
- B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
+ B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, I32), AddrRegs);
MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
}
@@ -6402,10 +6499,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
MachineRegisterInfo *MRI = B.getMRI();
- const LLT S32 = LLT::scalar(32);
- const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
-
unsigned DMask = 0;
Register VData;
LLT Ty;
@@ -6425,9 +6518,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT AddrTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 =
- ST.hasG16() ? (BaseOpcode->Gradients && GradTy.isScalar(16)) : GradTy.isScalar(16);
- const bool IsA16 = AddrTy.isScalar(16);
- const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType().isScalar(16);
+ ST.hasG16() ? (BaseOpcode->Gradients && GradTy.isInteger(16)) : GradTy.isInteger(16);
+ const bool IsA16 = AddrTy.isInteger(16);
+ const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType().isFloat(16);
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -6520,13 +6613,13 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (UsePartialNSA) {
// Pack registers that would go over NSAMaxSize into last VAddr register
LLT PackedAddrTy =
- LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
+ LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), I16);
auto Concat = B.buildConcatVectors(
PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
PackedRegs.resize(NSAMaxSize);
} else if (!UseNSA && PackedRegs.size() > 1) {
- LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
+ LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), I16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
PackedRegs.resize(1);
@@ -6635,17 +6728,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (IsD16 && ST.hasUnpackedD16VMem()) {
RoundedTy =
- LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
- TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
- RegTy = S32;
+ LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), I32);
+ TFETy = LLT::fixed_vector(AdjustedNumElts + 1, I32);
+ RegTy = I32;
} else {
unsigned EltSize = EltTy.getSizeInBits();
unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
RoundedTy = LLT::scalarOrVector(
- ElementCount::getFixed(RoundedSize / EltSize), EltSize);
- TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
- RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
+ ElementCount::getFixed(RoundedSize / EltSize), EltTy);
+ TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, I32);
+ RegTy = !IsTFE && EltSize == 16 ? V2I16 : I32;
}
// The return type does not need adjustment.
@@ -6681,10 +6774,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MI.removeOperand(1);
// Handle the easy case that requires no repack instructions.
- if (Ty.isScalar(32)) {
+ if (Ty.isInteger(32)) {
B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
return true;
}
+
+ if (Ty.isFloat(32)) {
+ auto DstI32 = MRI->createGenericVirtualRegister(I32);
+ B.buildUnmerge({DstI32, Dst1Reg}, NewResultReg);
+ B.buildBitcast(DstReg, DstI32);
+ return true;
+ }
}
// Now figure out how to copy the new result register back into the old
@@ -6731,10 +6831,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// cast for the TFE result if a multiple of v2s16 was used.
if (!RegTy.isFixedVector(2, 16) && !ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
- Reg = B.buildBitcast(V2S16, Reg).getReg(0);
+ Reg = B.buildBitcast(V2I16, Reg).getReg(0);
} else if (ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
- Reg = B.buildTrunc(S16, Reg).getReg(0);
+ Reg = B.buildTrunc(I16, Reg).getReg(0);
}
}
@@ -6763,8 +6863,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (ResultRegs.size() == 1) {
NewResultReg = ResultRegs[0];
} else if (ResultRegs.size() == 2) {
- LLT V4S16 = LLT::fixed_vector(4, 16);
- NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
+ NewResultReg = B.buildConcatVectors(V4I16, ResultRegs).getReg(0);
} else {
return false;
}
@@ -6801,7 +6900,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
: AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
// The 8-bit and 16-bit scalar buffer load instructions have 32-bit
// destination register.
- Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
+ Dst = B.getMRI()->createGenericVirtualRegister(I32);
} else {
Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
Dst = OrigDst;
@@ -6814,7 +6913,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
B.setInsertPt(B.getMBB(), MI);
}
- if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
+ if (shouldBitcastLoadStoreType(ST, Ty, LLT::integer(Size))) {
Ty = getBitcastRegisterType(Ty);
Helper.bitcastDst(MI, Ty, 0);
B.setInsertPt(B.getMBB(), MI);
@@ -6839,7 +6938,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
if (Dst != OrigDst) {
MI.getOperand(0).setReg(Dst);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
- B.buildTrunc(OrigDst, Dst);
+
+ if (Ty.isFloat()) {
+ auto Trunc = B.buildTrunc(Ty.dropType(), Dst);
+ B.buildBitcast(OrigDst, Trunc);
+ } else {
+ B.buildTrunc(OrigDst, Dst);
+ }
}
// If we don't have 96-bit result scalar loads, widening to 128-bit should
@@ -6849,7 +6954,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
- Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
+ Helper.widenScalarDst(MI, getPow2IntegerType(Ty), 0);
}
Observer.changedInstr(MI);
@@ -6912,8 +7017,6 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
- const LLT S64 = LLT::scalar(64);
-
Register SGPR01(AMDGPU::SGPR0_SGPR1);
// For code object version 5, queue_ptr is passed through implicit kernarg.
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
@@ -6936,15 +7039,15 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- LLT::scalar(64), commonAlignment(Align(64), Offset));
+ I64, commonAlignment(Align(64), Offset));
// Pointer address
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
B.buildPtrAdd(LoadAddr, KernargPtrReg,
- B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+ B.buildConstant(I64, Offset).getReg(0));
// Load address
- Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
+ Register Temp = B.buildLoad(I64, LoadAddr, *MMO).getReg(0);
B.buildCopy(SGPR01, Temp);
B.buildInstr(AMDGPU::S_TRAP)
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
@@ -7012,11 +7115,6 @@ bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
MachineIRBuilder &B) const {
MachineRegisterInfo &MRI = *B.getMRI();
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
- const LLT V2S16 = LLT::fixed_vector(2, 16);
- const LLT V3S32 = LLT::fixed_vector(3, 32);
-
Register DstReg = MI.getOperand(0).getReg();
Register NodePtr = MI.getOperand(2).getReg();
Register RayExtent = MI.getOperand(3).getReg();
@@ -7036,8 +7134,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
const bool IsGFX11 = AMDGPU::isGFX11(ST);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
- const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
- const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
+ const bool IsA16 = MRI.getType(RayDir).getElementType().isFloat(16);
+ const bool Is64 = MRI.getType(NodePtr).isInteger(64);
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
@@ -7066,10 +7164,10 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
SmallVector<Register, 12> Ops;
if (UseNSA && IsGFX11Plus) {
- auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
- auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+ auto packLanes = [&Ops, &B](Register Src) {
+ auto Unmerge = B.buildUnmerge({I32, I32, I32}, Src);
auto Merged = B.buildMergeLikeInstr(
- V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
+ V3I32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
Ops.push_back(Merged.getReg(0));
};
@@ -7078,20 +7176,22 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
+ auto BitcastRayDir = B.buildBitcast(V3I16, RayDir);
+ auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, BitcastRayDir);
+ auto BitcastRayInvDir = B.buildBitcast(V3I16, RayInvDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, BitcastRayInvDir);
auto MergedDir = B.buildMergeLikeInstr(
- V3S32,
+ V3I32,
{B.buildBitcast(
- S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
+ I32, B.buildMergeLikeInstr(V2I16, {UnmergeRayInvDir.getReg(0),
UnmergeRayDir.getReg(0)}))
.getReg(0),
B.buildBitcast(
- S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
+ I32, B.buildMergeLikeInstr(V2I16, {UnmergeRayInvDir.getReg(1),
UnmergeRayDir.getReg(1)}))
.getReg(0),
B.buildBitcast(
- S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
+ I32, B.buildMergeLikeInstr(V2I16, {UnmergeRayInvDir.getReg(2),
UnmergeRayDir.getReg(2)}))
.getReg(0)});
Ops.push_back(MergedDir.getReg(0));
@@ -7101,7 +7201,7 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
}
} else {
if (Is64) {
- auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
+ auto Unmerge = B.buildUnmerge({I32, I32}, NodePtr);
Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));
} else {
@@ -7109,8 +7209,9 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
}
Ops.push_back(RayExtent);
- auto packLanes = [&Ops, &S32, &B](Register Src) {
- auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+ auto packLanes = [&Ops, &B](Register Src) {
+ auto Bitcast = B.buildBitcast(V3I32, Src);
+ auto Unmerge = B.buildUnmerge({I32, I32, I32}, Bitcast);
Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));
Ops.push_back(Unmerge.getReg(2));
@@ -7118,11 +7219,13 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
- auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
- auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
- Register R1 = MRI.createGenericVirtualRegister(S32);
- Register R2 = MRI.createGenericVirtualRegister(S32);
- Register R3 = MRI.createGenericVirtualRegister(S32);
+ auto BitcastRayDir = B.buildBitcast(V3I16, RayDir);
+ auto UnmergeRayDir = B.buildUnmerge({I16, I16, I16}, BitcastRayDir);
+ auto BitcastRayInvDir = B.buildBitcast(V3I16, RayInvDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({I16, I16, I16}, BitcastRayInvDir);
+ Register R1 = MRI.createGenericVirtualRegister(I32);
+ Register R2 = MRI.createGenericVirtualRegister(I32);
+ Register R3 = MRI.createGenericVirtualRegister(I32);
B.buildMergeLikeInstr(R1,
{UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
B.buildMergeLikeInstr(
@@ -7140,7 +7243,7 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
if (!UseNSA) {
// Build a single vector containing all the operands so far prepared.
- LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
+ LLT OpTy = LLT::fixed_vector(Ops.size(), I32);
Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
Ops.clear();
Ops.push_back(MergedOps);
@@ -7177,11 +7280,10 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
// With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
if (!ST.hasArchitectedSGPRs())
return false;
- LLT S32 = LLT::scalar(32);
Register DstReg = MI.getOperand(0).getReg();
- auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
- auto LSB = B.buildConstant(S32, 25);
- auto Width = B.buildConstant(S32, 5);
+ auto TTMP8 = B.buildCopy(I32, Register(AMDGPU::TTMP8));
+ auto LSB = B.buildConstant(I32, 25);
+ auto Width = B.buildConstant(I32, 5);
B.buildUbfx(DstReg, TTMP8, LSB, Width);
MI.eraseFromParent();
return true;
@@ -7197,15 +7299,15 @@ bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Src = MI.getOperand(0).getReg();
- if (MRI.getType(Src) != S64)
+ if (MRI.getType(Src) != I64)
return false;
auto ModeReg =
- B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
+ B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {I32},
/*HasSideEffects=*/true, /*isConvergent=*/false)
.addImm(FPEnvModeBitField);
auto TrapReg =
- B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
+ B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {I32},
/*HasSideEffects=*/true, /*isConvergent=*/false)
.addImm(FPEnvTrapBitField);
B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
@@ -7220,7 +7322,7 @@ bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
if (!MRI.getType(Src).isScalar(64))
return false;
- auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
+ auto Unmerge = B.buildUnmerge({I32, I32}, MI.getOperand(0));
B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
/*HasSideEffects=*/true, /*isConvergent=*/false)
.addImm(static_cast<int16_t>(FPEnvModeBitField))
@@ -7524,18 +7626,16 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
Register Index = MI.getOperand(5).getReg();
- LLT S32 = LLT::scalar(32);
- if (!MRI.getType(Index).isScalar(32))
- MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ if (!MRI.getType(Index).isInteger(32))
+ MI.getOperand(5).setReg(B.buildAnyExt(I32, Index).getReg(0));
return true;
}
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
Register Index = MI.getOperand(7).getReg();
- LLT S32 = LLT::scalar(32);
- if (!MRI.getType(Index).isScalar(32))
- MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ if (!MRI.getType(Index).isInteger(32))
+ MI.getOperand(7).setReg(B.buildAnyExt(I32, Index).getReg(0));
return true;
}
case Intrinsic::amdgcn_fmed3: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 86c15197805d23..2deda39224abea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -77,6 +77,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const;
+ bool legalizeFPExt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const;
+ bool legalizeFPTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const;
+
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 107d0f8c495032..bbff60b5a31870 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -226,20 +226,21 @@ bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
MachineInstr &MI) const {
- const LLT S32 = LLT::scalar(32);
+ const LLT I32 = LLT::integer(32);
+ const LLT F32 = LLT::float32();
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
- if (!SrcTy.isScalar(32))
- SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
+ if (!SrcTy.isInteger(32))
+ SrcReg = B.buildAnyExtOrTrunc(I32, SrcReg).getReg(0);
- if (Ty.isScalar(32)) {
+ if (Ty.isFloat(32)) {
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
MI.getFlags());
} else {
- auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
+ auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {F32}, {SrcReg},
MI.getFlags());
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
}
>From 893365c5af375c6e77f21ff37da9bb56e681cafe Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Wed, 8 Jan 2025 14:43:26 +0000
Subject: [PATCH 09/11] add draft for InferTypeInfoPass
---
.../CodeGen/GlobalISel/InferTypeInfoPass.h | 43 +++
llvm/include/llvm/InitializePasses.h | 1 +
llvm/lib/CodeGen/GlobalISel/CMakeLists.txt | 1 +
llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp | 1 +
.../CodeGen/GlobalISel/InferTypeInfoPass.cpp | 298 ++++++++++++++++++
.../llvm/lib/CodeGen/GlobalISel/BUILD.gn | 1 +
6 files changed, 345 insertions(+)
create mode 100644 llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h
create mode 100644 llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h b/llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h
new file mode 100644
index 00000000000000..7fbbe76c8ee6a8
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InferTypeInfoPass.h
@@ -0,0 +1,43 @@
+#ifndef LLVM_CODEGEN_GLOBALISEL_INFERTYPEINFOPASS_H
+#define LLVM_CODEGEN_GLOBALISEL_INFERTYPEINFOPASS_H
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class InferTypeInfo : public MachineFunctionPass {
+public:
+ static char ID;
+
+private:
+ MachineRegisterInfo *MRI = nullptr;
+ MachineFunction *MF = nullptr;
+
+ MachineIRBuilder Builder;
+
+ /// Initialize the field members using \p MF.
+ void init(MachineFunction &MF);
+
+public:
+ InferTypeInfo() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ bool inferTypeInfo(MachineFunction &MF);
+
+ bool shouldBeFP(MachineOperand &Op, unsigned Depth) const;
+
+ void updateDef(Register Reg);
+
+ void updateUse(MachineOperand &Op, bool FP);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_GLOBALISEL_INFERTYPEINFOPASS_H
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 1cb9013bc48cc5..c07735551be317 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -131,6 +131,7 @@ void initializeHardwareLoopsLegacyPass(PassRegistry &);
void initializeMIRProfileLoaderPassPass(PassRegistry &);
void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &);
void initializeIRTranslatorPass(PassRegistry &);
+void initializeInferTypeInfoPass(PassRegistry &);
void initializeIVUsersWrapperPassPass(PassRegistry &);
void initializeIfConverterPass(PassRegistry &);
void initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index a45024d120be68..627b629bb7846e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -13,6 +13,7 @@ add_llvm_component_library(LLVMGlobalISel
GIMatchTableExecutor.cpp
GISelChangeObserver.cpp
IRTranslator.cpp
+ InferTypeInfoPass.cpp
InlineAsmLowering.cpp
InstructionSelect.cpp
InstructionSelector.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
index efcc40641ea80c..b23b9499b4972d 100644
--- a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -16,6 +16,7 @@ using namespace llvm;
void llvm::initializeGlobalISel(PassRegistry &Registry) {
initializeIRTranslatorPass(Registry);
+ initializeInferTypeInfoPass(Registry);
initializeLegalizerPass(Registry);
initializeLoadStoreOptPass(Registry);
initializeLocalizerPass(Registry);
diff --git a/llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp b/llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp
new file mode 100644
index 00000000000000..32e729df7ceae1
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/InferTypeInfoPass.cpp
@@ -0,0 +1,298 @@
+//===- llvm/CodeGen/GlobalISel/InferTypeInfoPass.cpp - InferTypeInfoPass -*-
+// C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the InferTypeInfo pass.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/InferTypeInfoPass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "mir-infer-type-info"
+
+using namespace llvm;
+
+char InferTypeInfo::ID = 0;
+
+INITIALIZE_PASS_BEGIN(InferTypeInfo, DEBUG_TYPE, "TODO", false, false)
+INITIALIZE_PASS_END(InferTypeInfo, DEBUG_TYPE, "TODO", false, false)
+
+void InferTypeInfo::init(MachineFunction &MF) {
+ this->MF = &MF;
+ MRI = &MF.getRegInfo();
+ Builder.setMF(MF);
+}
+
+void InferTypeInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static LLT updateType(LLT Ty, bool FP) {
+ LLT InferredScalarTy =
+ FP ? LLT::floatingPoint(Ty.getScalarSizeInBits(), LLT::FPInfo::IEEE_FLOAT)
+ : LLT::integer(Ty.getScalarSizeInBits());
+ LLT InferredTy =
+ Ty.isVector() ? Ty.changeElementType(InferredScalarTy) : InferredScalarTy;
+
+ return InferredTy;
+}
+
+void InferTypeInfo::updateDef(Register Reg) {
+ LLT Ty = MRI->getType(Reg);
+ LLT InferredTy = updateType(Ty, true);
+
+ MRI->setType(Reg, InferredTy);
+}
+
+void InferTypeInfo::updateUse(MachineOperand &Op, bool FP) {
+ assert(Op.isReg());
+ LLT Ty = MRI->getType(Op.getReg());
+ LLT InferredTy = updateType(Ty, FP);
+
+ MachineOperand *Def = MRI->getOneDef(Op.getReg());
+ MachineInstr *MI = Op.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+
+ Builder.setInsertPt(*MBB, MI);
+ auto Bitcast = Builder.buildBitcast(InferredTy, Def->getReg());
+ Op.setReg(Bitcast.getReg(0));
+}
+
+constexpr unsigned MaxFPRSearchDepth = 5;
+
+bool InferTypeInfo::shouldBeFP(MachineOperand &Op, unsigned Depth = 0) const {
+ if (Depth > MaxFPRSearchDepth)
+ return false;
+
+ if (!Op.isReg())
+ return false;
+
+ MachineInstr &MI = *Op.getParent();
+
+ auto Pred = [&](MachineOperand &O) { return shouldBeFP(O, Depth + 1); };
+
+ // TODO: cache FP registers
+
+ switch (MI.getOpcode()) {
+ // def and use fp instructions
+ case TargetOpcode::G_FABS:
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FCANONICALIZE:
+ case TargetOpcode::G_FCEIL:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FCOPYSIGN:
+ case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FEXP2:
+ case TargetOpcode::G_FEXP:
+ case TargetOpcode::G_FFLOOR:
+ case TargetOpcode::G_FLOG10:
+ case TargetOpcode::G_FLOG2:
+ case TargetOpcode::G_FLOG:
+ case TargetOpcode::G_FMA:
+ case TargetOpcode::G_FMAD:
+ case TargetOpcode::G_FMAXIMUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ case TargetOpcode::G_FMINIMUM:
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FNEARBYINT:
+ case TargetOpcode::G_FNEG:
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPOW:
+ case TargetOpcode::G_FPTRUNC:
+ case TargetOpcode::G_FREM:
+ case TargetOpcode::G_FRINT:
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FTAN:
+ case TargetOpcode::G_FACOS:
+ case TargetOpcode::G_FASIN:
+ case TargetOpcode::G_FATAN:
+ case TargetOpcode::G_FATAN2:
+ case TargetOpcode::G_FCOSH:
+ case TargetOpcode::G_FSINH:
+ case TargetOpcode::G_FTANH:
+ case TargetOpcode::G_FSQRT:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_INTRINSIC_ROUND:
+ case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
+ case TargetOpcode::G_INTRINSIC_TRUNC:
+ case TargetOpcode::G_VECREDUCE_FADD:
+ case TargetOpcode::G_VECREDUCE_FMUL:
+ case TargetOpcode::G_VECREDUCE_FMAX:
+ case TargetOpcode::G_VECREDUCE_FMIN:
+ case TargetOpcode::G_VECREDUCE_FMAXIMUM:
+ case TargetOpcode::G_VECREDUCE_FMINIMUM:
+ case TargetOpcode::G_VECREDUCE_SEQ_FADD:
+ case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
+ return true;
+ // instructions whose def (only) is fp; source operands are integer
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP:
+ return Op.isDef();
+ // instructions whose uses (only) are fp; the def is integer
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI:
+ case TargetOpcode::G_FPTOSI_SAT:
+ case TargetOpcode::G_FPTOUI_SAT:
+ case TargetOpcode::G_FCMP:
+ case TargetOpcode::G_LROUND:
+ case TargetOpcode::G_LLROUND:
+ return Op.isUse();
+ case TargetOpcode::G_FREEZE:
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_PHI:
+ case TargetOpcode::G_SELECT:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ case TargetOpcode::G_INSERT_SUBVECTOR:
+ case TargetOpcode::G_EXTRACT_SUBVECTOR:
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ case TargetOpcode::G_SPLAT_VECTOR:
+ case TargetOpcode::G_STEP_VECTOR:
+ case TargetOpcode::G_VECTOR_COMPRESS: {
+ return all_of(MI.all_defs(),
+ [&](MachineOperand &O) {
+ return all_of(MRI->use_operands(O.getReg()), Pred);
+ }) &&
+ all_of(MI.all_uses(), [&](MachineOperand &O) {
+ return all_of(MRI->def_operands(O.getReg()), Pred);
+ });
+ }
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &LHS = MI.getOperand(1);
+ MachineOperand &RHS = MI.getOperand(2);
+
+ return all_of(MRI->use_operands(Dst.getReg()), Pred) &&
+ (!LHS.isReg() || all_of(MRI->def_operands(LHS.getReg()), Pred)) &&
+ (!RHS.isReg() || all_of(MRI->def_operands(RHS.getReg()), Pred));
+ }
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_INDEXED_STORE: {
+ MachineOperand &Val = MI.getOperand(0);
+ return Op.getReg() == Val.getReg() && all_of(MRI->def_operands(Op.getReg()), Pred);
+ }
+ case TargetOpcode::G_INDEXED_LOAD:
+ case TargetOpcode::G_LOAD: {
+ MachineOperand &Dst = MI.getOperand(0);
+ return Op.getReg() == Dst.getReg() && all_of(MRI->use_operands(Dst.getReg()), Pred);
+ }
+ case TargetOpcode::G_ATOMICRMW_FADD:
+ case TargetOpcode::G_ATOMICRMW_FSUB:
+ case TargetOpcode::G_ATOMICRMW_FMAX:
+ case TargetOpcode::G_ATOMICRMW_FMIN: {
+ MachineOperand &WriteBack = MI.getOperand(0);
+ MachineOperand &FPOp = MI.getOperand(2);
+ return Op.getReg() == WriteBack.getReg() || Op.getReg() == FPOp.getReg();
+ }
+ case TargetOpcode::G_INTRINSIC_CONVERGENT:
+ case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ case TargetOpcode::G_INTRINSIC: {
+ GIntrinsic *Intrinsic = dyn_cast<GIntrinsic>(&MI);
+ if (!Intrinsic)
+ return false;
+
+ switch (Intrinsic->getIntrinsicID()) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_sqrt:
+ case Intrinsic::amdgcn_fdot2_f16_f16:
+ case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
+ return true;
+ default:
+ return false;
+ }
+ return false;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
+
+bool InferTypeInfo::inferTypeInfo(MachineFunction &MF) {
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB.instrs()) {
+
+ for (auto &Def : MI.all_defs()) {
+ if (shouldBeFP(Def)) {
+ updateDef(Def.getReg());
+ Changed |= true;
+ }
+ }
+
+ for (auto &Use : MI.all_uses()) {
+ bool IsFPDef =
+ MRI->getVRegDef(Use.getReg()) &&
+ all_of(MRI->def_operands(Use.getReg()),
+ [&](MachineOperand &Op) { return shouldBeFP(Op); });
+ bool IsFPUse = shouldBeFP(Use);
+
+ if (IsFPUse && !IsFPDef) {
+ updateUse(Use, true);
+ Changed |= true;
+ } else if (!IsFPUse && IsFPDef) {
+ updateUse(Use, false);
+ Changed |= true;
+ }
+ }
+
+ for (auto &MemOp: MI.memoperands()) {
+ bool IsFP = any_of(MI.all_defs(), [&](MachineOperand &O){ return shouldBeFP(O); }) ||
+ any_of(MI.all_uses(), [&](MachineOperand &O){ return shouldBeFP(O); });
+
+ if (!IsFP)
+ continue;
+
+ LLT Ty = MemOp->getType();
+ LLT NewTy = updateType(Ty, true);
+ MemOp->setType(NewTy);
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool InferTypeInfo::runOnMachineFunction(MachineFunction &MF) {
+ init(MF);
+ bool Changed = false;
+ Changed |= inferTypeInfo(MF);
+ return Changed;
+}
\ No newline at end of file
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
index dc9e449195159a..37d1cf7e93aeaf 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
@@ -26,6 +26,7 @@ static_library("GlobalISel") {
"GISelKnownBits.cpp",
"GlobalISel.cpp",
"IRTranslator.cpp",
+ "InferTypeInfoPass.cpp",
"InlineAsmLowering.cpp",
"InstructionSelect.cpp",
"InstructionSelector.cpp",
>From bf5cc06c14bb09b1c28dbd0eec51bf6dd4692f53 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Fri, 10 Jan 2025 16:32:17 +0000
Subject: [PATCH 10/11] patch MIR parser
---
llvm/lib/CodeGen/MIRParser/MIParser.cpp | 51 ++++++++++++++++++++-----
1 file changed, 42 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index f77c4613ad801b..c73f5230a7645a 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -1923,13 +1924,19 @@ static bool verifyAddrSpace(uint64_t AddrSpace) {
}
bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
- if (Token.range().front() == 's' || Token.range().front() == 'p') {
+ if (Token.range().front() == 's' || Token.range().front() == 'i' || Token.range().front() == 'f' || Token.range().front() == 'p') {
StringRef SizeStr = Token.range().drop_front();
if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
- return error("expected integers after 's'/'p' type character");
+ return error("expected integers after 's'/'i'/'f'/'p' type character");
+ }
+
+ if (Token.range().substr(0,2) == "bf") {
+ StringRef SizeStr = Token.range().drop_front(2);
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 'bf' type string");
}
- if (Token.range().front() == 's') {
+ if (Token.range().front() == 's' || Token.range().front() == 'i') {
auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
if (ScalarSize) {
if (!verifyScalarSize(ScalarSize))
@@ -1949,6 +1956,20 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
lex();
return false;
+ } else if (Token.range().front() == 'f') {
+ auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!ScalarSize || !verifyScalarSize(ScalarSize))
+ return error("invalid size for scalar type");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::IEEE_FLOAT);
+ lex();
+ return false;
+ } else if (Token.range().substr(0, 2) == "bf") {
+ auto ScalarSize = APSInt(Token.range().drop_front(2)).getZExtValue();
+ if (!ScalarSize || !verifyScalarSize(ScalarSize))
+ return error("invalid size for scalar type");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::VARIANT_FLOAT_1);
+ lex();
+ return false;
}
// Now we're looking for a vector.
@@ -1985,14 +2006,16 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
return GetError();
lex();
- if (Token.range().front() != 's' && Token.range().front() != 'p')
+ if (Token.range().front() != 's' && Token.range().front() != 'i' &&
+ Token.range().front() != 'f' && Token.range().front() != 'p' &&
+ Token.range().substr(0, 2) != "bf")
return GetError();
StringRef SizeStr = Token.range().drop_front();
if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
- return error("expected integers after 's'/'p' type character");
+ return error("expected integers after 's'/'i'/'f'/'p' type character");
- if (Token.range().front() == 's') {
+ if (Token.range().front() == 's' || Token.range().front() == 'i') {
auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
if (!verifyScalarSize(ScalarSize))
return error("invalid size for scalar element in vector");
@@ -2004,6 +2027,16 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
return error("invalid address space number");
Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
+ } else if (Token.range().front() == 'f') {
+ auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!verifyScalarSize(ScalarSize))
+ return error("invalid size for float element in vector");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::IEEE_FLOAT);
+ } else if (Token.range().substr(0, 2) == "bf") {
+ auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!verifyScalarSize(ScalarSize))
+ return error("invalid size for bfloat element in vector");
+ Ty = LLT::floatingPoint(ScalarSize, LLT::FPInfo::VARIANT_FLOAT_1);
} else
return GetError();
lex();
@@ -2021,12 +2054,12 @@ bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) {
assert(Token.is(MIToken::Identifier));
StringRef TypeStr = Token.range();
if (TypeStr.front() != 'i' && TypeStr.front() != 's' &&
- TypeStr.front() != 'p')
+ TypeStr.front() != 'p' && TypeStr.front() != 'f' && TypeStr.substr(0,2) != "bf")
return error(
- "a typed immediate operand should start with one of 'i', 's', or 'p'");
+ "a typed immediate operand should start with one of 'i', 's','f','bf', or 'p'");
StringRef SizeStr = Token.range().drop_front();
if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
- return error("expected integers after 'i'/'s'/'p' type character");
+ return error("expected integers after 'i'/'s'/'f'/'bf'/'p' type character");
auto Loc = Token.location();
lex();
>From b72f910b822d18d13bd1c31f2c3ba675a92a8b75 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at brium.ai>
Date: Tue, 14 Jan 2025 14:38:38 +0000
Subject: [PATCH 11/11] LegalizerHelper FPInfo
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 474 ++++++++++--------
1 file changed, 254 insertions(+), 220 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 63cb2e6ef92b87..ad04eeb2730d6e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -74,7 +74,7 @@ getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
LLT::scalarOrVector(ElementCount::getFixed(LeftoverSize / EltSize),
OrigTy.getElementType());
} else {
- LeftoverTy = LLT::scalar(LeftoverSize);
+ LeftoverTy = LLT::integer(LeftoverSize);
}
int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
@@ -278,7 +278,7 @@ LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
// Shift the sign bit of the low register through the high register.
auto ShiftAmt =
- MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
+ MIRBuilder.buildConstant(LLT::integer(64), GCDTy.getSizeInBits() - 1);
PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
}
}
@@ -1075,7 +1075,7 @@ LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
const CmpInst::Predicate ICmpPred,
const DstOp &Res) -> Register {
// FCMP libcall always returns an i32, and needs an ICMP with #0.
- constexpr LLT TempLLT = LLT::scalar(32);
+ constexpr LLT TempLLT = LLT::integer(32);
Register Temp = MRI.createGenericVirtualRegister(TempLLT);
// Generate libcall, holding result in Temp
const auto Status = createLibcall(
@@ -1190,7 +1190,7 @@ LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
- auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
+ auto DefValue = MIRBuilder.buildConstant(LLT::integer(PtrSize), -1LL);
DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
MIRBuilder.buildIntToPtr(Dest, DefValue);
@@ -1473,7 +1473,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
SmallVector<Register, 1> LeftoverRegs;
if (LeftoverBits != 0) {
- LeftoverTy = LLT::scalar(LeftoverBits);
+ LeftoverTy = LLT::integer(LeftoverBits);
auto K = MIRBuilder.buildConstant(
LeftoverTy,
Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
@@ -1994,7 +1994,7 @@ Register LegalizerHelper::coerceToScalar(Register Val) {
return Val;
const DataLayout &DL = MIRBuilder.getDataLayout();
- LLT NewTy = LLT::scalar(Ty.getSizeInBits());
+ LLT NewTy = LLT::integer(Ty.getSizeInBits());
if (Ty.isPointer()) {
if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
return Register();
@@ -2012,51 +2012,77 @@ Register LegalizerHelper::coerceToScalar(Register Val) {
void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
unsigned OpIdx, unsigned ExtOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
- LLT SrcTy = MRI.getType(MO.getReg());
+ Register Src = MO.getReg();
+ LLT Ty = MRI.getType(Src);
- if (SrcTy.isFloat() && ExtOpcode != TargetOpcode::G_FPEXT) {
- auto Cast = MIRBuilder.buildBitcast(SrcTy.dropType(), MO);
- auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {Cast});
- MO.setReg(ExtB.getReg(0));
- return;
- }
+ bool IsSrcFP = Ty.isFloat() || Ty.isFloatVector();
+ bool IsDstFP = WideTy.isFloat() || WideTy.isFloatVector();
+
+ if (IsSrcFP != IsDstFP)
+ Src = MIRBuilder.buildBitcast(Ty.dropType(), Src).getReg(0);
- auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
+ auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {Src});
MO.setReg(ExtB.getReg(0));
}
void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
unsigned OpIdx) {
MachineOperand &MO = MI.getOperand(OpIdx);
- auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
+ Register Src = MO.getReg();
+ LLT Ty = MRI.getType(Src);
+
+ bool IsSrcFP = Ty.isFloat() || Ty.isFloatVector();
+ bool IsDstFP = NarrowTy.isFloat() || NarrowTy.isFloatVector();
+
+ if (IsSrcFP != IsDstFP)
+ Src = MIRBuilder.buildBitcast(Ty.dropType(), Src).getReg(0);
+
+ auto ExtB = MIRBuilder.buildTrunc(NarrowTy, Src);
MO.setReg(ExtB.getReg(0));
}
void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
unsigned OpIdx, unsigned TruncOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
- LLT DstTy = MRI.getType(MO.getReg());
- Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+ Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+ Register Dst = MO.getReg();
+ LLT Ty = MRI.getType(Dst);
+
+ bool IsSrcFP = Ty.isFloat() || Ty.isFloatVector();
+ bool IsDstFP = WideTy.isFloat() || WideTy.isFloatVector();
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
- if (DstTy.isFloat() && TruncOpcode != TargetOpcode::G_FPTRUNC) {
- auto Trunc = MIRBuilder.buildInstr(TruncOpcode, {DstTy.dropType()}, {DstExt});
+ if (IsSrcFP != IsDstFP) {
+ auto Trunc = MIRBuilder.buildInstr(TruncOpcode, {Ty.dropType()}, {DstExt});
MIRBuilder.buildBitcast(MO, Trunc);
- MO.setReg(DstExt);
- return;
+ } else {
+ MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
}
-
- MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
+
MO.setReg(DstExt);
}
+
void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
unsigned OpIdx, unsigned ExtOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
+ Register Dst = MO.getReg();
+ LLT Ty = MRI.getType(Dst);
+
+ bool IsSrcFP = Ty.isFloat() || Ty.isFloatVector();
+ bool IsDstFP = NarrowTy.isFloat() || NarrowTy.isFloatVector();
+
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
- MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
+
+ if (IsSrcFP != IsDstFP) {
+    auto Ext = MIRBuilder.buildInstr(ExtOpcode, {Ty.dropType()}, {DstTrunc});
+ MIRBuilder.buildBitcast(MO, Ext);
+ } else {
+ MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
+ }
+
MO.setReg(DstTrunc);
}
@@ -2162,12 +2188,12 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
// %10:_(s12) = G_MERGE_VALUES %8, %9
const int GCD = std::gcd(SrcSize, WideSize);
- LLT GCDTy = LLT::scalar(GCD);
+ LLT GCDTy = LLT::integer(GCD);
SmallVector<Register, 8> Parts;
SmallVector<Register, 8> NewMergeRegs;
SmallVector<Register, 8> Unmerges;
- LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
+ LLT WideDstTy = LLT::integer(NumMerge * WideSize);
// Decompose the original operands if they don't evenly divide.
for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
@@ -2237,7 +2263,7 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
return UnableToLegalize;
}
- SrcTy = LLT::scalar(SrcTy.getSizeInBits());
+ SrcTy = LLT::integer(SrcTy.getSizeInBits());
SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
}
@@ -2358,7 +2384,7 @@ LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
return UnableToLegalize;
- LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
+ LLT SrcAsIntTy = LLT::integer(SrcTy.getSizeInBits());
Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
SrcTy = SrcAsIntTy;
}
@@ -3794,7 +3820,7 @@ LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
// Check if bitcast is Legal
auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
- LLT SrcScalTy = LLT::scalar(SrcTy.getSizeInBits());
+ LLT SrcScalTy = SrcTy.getScalarType();
// Check if the build vector is Legal
if (!LI.isLegal({TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
@@ -3903,7 +3929,7 @@ LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
return UnableToLegalize;
Idx /= AdjustAmt;
- SrcTy = LLT::vector(SrcTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
+ SrcTy = LLT::vector(SrcTyEC.divideCoefficientBy(AdjustAmt), LLT::integer(AdjustAmt));
auto CastVec = MIRBuilder.buildBitcast(SrcTy, Src);
auto PromotedES = MIRBuilder.buildExtractSubvector(CastTy, CastVec, Idx);
MIRBuilder.buildBitcast(Dst, PromotedES);
@@ -3971,8 +3997,8 @@ LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
return UnableToLegalize;
Idx /= AdjustAmt;
- BigVecTy = LLT::vector(BigVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
- SubVecTy = LLT::vector(SubVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
+ BigVecTy = LLT::vector(BigVecTyEC.divideCoefficientBy(AdjustAmt), LLT::integer(AdjustAmt));
+ SubVecTy = LLT::vector(SubVecTyEC.divideCoefficientBy(AdjustAmt), LLT::integer(AdjustAmt));
auto CastBigVec = MIRBuilder.buildBitcast(BigVecTy, BigVec);
auto CastSubVec = MIRBuilder.buildBitcast(SubVecTy, SubVec);
auto PromotedIS =
@@ -4001,7 +4027,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
// Promote to a byte-sized load if not loading an integral number of
// bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
- LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
+ LLT WideMemTy = LLT::integer(MemStoreSizeInBits);
MachineMemOperand *NewMMO =
MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
@@ -4088,11 +4114,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
LLT PtrTy = MRI.getType(PtrReg);
unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
- LLT AnyExtTy = LLT::scalar(AnyExtSize);
+ LLT AnyExtTy = LLT::integer(AnyExtSize);
auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
PtrReg, *LargeMMO);
- auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
+ auto OffsetCst = MIRBuilder.buildConstant(LLT::integer(PtrTy.getSizeInBits()),
LargeSplitSize / 8);
Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
@@ -4141,7 +4167,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
// Promote to a byte-sized store with upper bits zero if not
// storing an integral number of bytes. For example, promote
// TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
- LLT WideTy = LLT::scalar(StoreSizeInBits);
+ LLT WideTy = LLT::integer(StoreSizeInBits);
if (StoreSizeInBits > SrcTy.getSizeInBits()) {
// Avoid creating a store with a narrower source than result.
@@ -4185,10 +4211,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
// e.g. an s56 store being broken into s32 + s24, we might have a stored type
// that's wider than the stored size.
unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
- const LLT NewSrcTy = LLT::scalar(AnyExtSize);
+ const LLT NewSrcTy = LLT::integer(AnyExtSize);
if (SrcTy.isPointer()) {
- const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
+ const LLT IntPtrTy = LLT::integer(SrcTy.getSizeInBits());
SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
}
@@ -4201,7 +4227,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
// Generate the PtrAdd and truncating stores.
LLT PtrTy = MRI.getType(PtrReg);
auto OffsetCst = MIRBuilder.buildConstant(
- LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
+ LLT::integer(PtrTy.getSizeInBits()), LargeSplitSize / 8);
auto SmallPtr =
MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
@@ -4231,7 +4257,7 @@ LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
// We need to build an integer scalar of the vector bit pattern.
// It's not legal for us to add padding when storing a vector.
unsigned NumBits = MemTy.getSizeInBits();
- LLT IntTy = LLT::scalar(NumBits);
+ LLT IntTy = LLT::integer(NumBits);
auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
LLT IdxTy = getLLTForMVT(TLI.getVectorIdxTy(MF.getDataLayout()));
@@ -5249,7 +5275,7 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
return UnableToLegalize;
LLT PtrTy = MRI.getType(AddrReg);
- const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
+ const LLT OffsetTy = LLT::integer(PtrTy.getSizeInBits());
unsigned TotalSize = ValTy.getSizeInBits();
@@ -5620,7 +5646,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
SVOps.push_back(MIRBuilder
.buildExtractVectorElement(
EltTy, Inputs[Input],
- MIRBuilder.buildConstant(LLT::scalar(32), Idx))
+ MIRBuilder.buildConstant(LLT::integer(32), Idx))
.getReg(0));
}
@@ -5907,8 +5933,8 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
// input. If that isn't small enough, the resulting pieces will be further
// legalized.
const unsigned NewBitSize = DstEltSize / 2;
- const LLT HalfTy = LLT::scalar(NewBitSize);
- const LLT CondTy = LLT::scalar(1);
+ const LLT HalfTy = LLT::integer(NewBitSize);
+ const LLT CondTy = LLT::integer(1);
if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
@@ -6452,12 +6478,12 @@ void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
// Add all factors and accumulate all carries into CarrySum.
if (DstIdx != DstParts - 1) {
MachineInstrBuilder Uaddo =
- B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
+ B.buildUAddo(NarrowTy, LLT::integer(1), Factors[0], Factors[1]);
FactorSum = Uaddo.getReg(0);
CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
for (unsigned i = 2; i < Factors.size(); ++i) {
MachineInstrBuilder Uaddo =
- B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
+ B.buildUAddo(NarrowTy, LLT::integer(1), FactorSum, Factors[i]);
FactorSum = Uaddo.getReg(0);
MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
@@ -6547,7 +6573,7 @@ LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
if (i == e - 1 && CarryDst)
CarryOut = CarryDst;
else
- CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
+ CarryOut = MRI.createGenericVirtualRegister(LLT::integer(1));
if (!CarryIn) {
MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
@@ -6676,7 +6702,7 @@ LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
Register SegReg = SrcRegs[i];
if (ExtractOffset != 0 || SegSize != NarrowSize) {
// A genuine extract is needed.
- SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+ SegReg = MRI.createGenericVirtualRegister(LLT::integer(SegSize));
MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
}
@@ -6755,7 +6781,7 @@ LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
Register SegReg = OpReg;
if (ExtractOffset != 0 || SegSize != OpSize) {
// A genuine extract is needed.
- SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+ SegReg = MRI.createGenericVirtualRegister(LLT::integer(SegSize));
MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
}
@@ -6767,7 +6793,7 @@ LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
uint64_t WideSize = DstRegs.size() * NarrowSize;
Register DstReg = MI.getOperand(0).getReg();
if (WideSize > RegTy.getSizeInBits()) {
- Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
+ Register MergeReg = MRI.createGenericVirtualRegister(LLT::integer(WideSize));
MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
MIRBuilder.buildTrunc(DstReg, MergeReg);
} else
@@ -6901,7 +6927,7 @@ LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
// ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
auto C_0 = B.buildConstant(NarrowTy, 0);
- auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
+ auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::integer(1),
UnmergeSrc.getReg(1), C_0);
auto LoCTLZ = IsUndef ?
B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
@@ -6934,7 +6960,7 @@ LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
// cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
auto C_0 = B.buildConstant(NarrowTy, 0);
- auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
+ auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::integer(1),
UnmergeSrc.getReg(0), C_0);
auto HiCTTZ = IsUndef ?
B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
@@ -7486,11 +7512,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
auto [Dst, Src] = MI.getFirst2Regs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
- const LLT S1 = LLT::scalar(1);
+ const LLT I64 = LLT::integer(64);
+ const LLT I32 = LLT::integer(32);
+ const LLT I1 = LLT::integer(1);
- assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(32));
+ assert(MRI.getType(Src).isInteger(64) && MRI.getType(Dst).isFloat(32));
// unsigned cul2f(ulong u) {
// uint lz = clz(u);
@@ -7502,38 +7528,39 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
// return as_float(v + r);
// }
- auto Zero32 = MIRBuilder.buildConstant(S32, 0);
- auto Zero64 = MIRBuilder.buildConstant(S64, 0);
+ auto Zero32 = MIRBuilder.buildConstant(I32, 0);
+ auto Zero64 = MIRBuilder.buildConstant(I64, 0);
- auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
+ auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(I32, Src);
- auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
- auto Sub = MIRBuilder.buildSub(S32, K, LZ);
+ auto K = MIRBuilder.buildConstant(I32, 127U + 63U);
+ auto Sub = MIRBuilder.buildSub(I32, K, LZ);
- auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
- auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
+ auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, I1, Src, Zero64);
+ auto E = MIRBuilder.buildSelect(I32, NotZero, Sub, Zero32);
- auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
- auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
+ auto Mask0 = MIRBuilder.buildConstant(I64, (-1ULL) >> 1);
+ auto ShlLZ = MIRBuilder.buildShl(I64, Src, LZ);
- auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
+ auto U = MIRBuilder.buildAnd(I64, ShlLZ, Mask0);
- auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
- auto T = MIRBuilder.buildAnd(S64, U, Mask1);
+ auto Mask1 = MIRBuilder.buildConstant(I64, 0xffffffffffULL);
+ auto T = MIRBuilder.buildAnd(I64, U, Mask1);
- auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
- auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
- auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
+ auto UShl = MIRBuilder.buildLShr(I64, U, MIRBuilder.buildConstant(I64, 40));
+ auto ShlE = MIRBuilder.buildShl(I32, E, MIRBuilder.buildConstant(I32, 23));
+ auto V = MIRBuilder.buildOr(I32, ShlE, MIRBuilder.buildTrunc(I32, UShl));
- auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
- auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
- auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
- auto One = MIRBuilder.buildConstant(S32, 1);
+ auto C = MIRBuilder.buildConstant(I64, 0x8000000000ULL);
+ auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, I1, T, C);
+ auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, I1, T, C);
+ auto One = MIRBuilder.buildConstant(I32, 1);
- auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
- auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
- auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
- MIRBuilder.buildAdd(Dst, V, R);
+ auto VTrunc1 = MIRBuilder.buildAnd(I32, V, One);
+ auto Select0 = MIRBuilder.buildSelect(I32, TCmp, VTrunc1, Zero32);
+ auto R = MIRBuilder.buildSelect(I32, RCmp, One, Select0);
+ auto Add = MIRBuilder.buildAdd(I32, V, R);
+ MIRBuilder.buildBitcast(Dst, Add);
MI.eraseFromParent();
return Legalized;
@@ -7544,30 +7571,30 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
auto [Dst, Src] = MI.getFirst2Regs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
- const LLT S1 = LLT::scalar(1);
+ const LLT I64 = LLT::integer(64);
+ const LLT F32 = LLT::float32();
+ const LLT I1 = LLT::integer(1);
- assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(32));
+ assert(MRI.getType(Src).isInteger(64) && MRI.getType(Dst).isFloat(32));
// For i64 < INT_MAX we simply reuse SITOFP.
// Otherwise, divide i64 by 2, round result by ORing with the lowest bit
// saved before division, convert to float by SITOFP, multiply the result
// by 2.
- auto One = MIRBuilder.buildConstant(S64, 1);
- auto Zero = MIRBuilder.buildConstant(S64, 0);
+ auto One = MIRBuilder.buildConstant(I64, 1);
+ auto Zero = MIRBuilder.buildConstant(I64, 0);
// Result if Src < INT_MAX
- auto SmallResult = MIRBuilder.buildSITOFP(S32, Src);
+ auto SmallResult = MIRBuilder.buildSITOFP(F32, Src);
// Result if Src >= INT_MAX
- auto Halved = MIRBuilder.buildLShr(S64, Src, One);
- auto LowerBit = MIRBuilder.buildAnd(S64, Src, One);
- auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
- auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
- auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
+ auto Halved = MIRBuilder.buildLShr(I64, Src, One);
+ auto LowerBit = MIRBuilder.buildAnd(I64, Src, One);
+ auto RoundedHalved = MIRBuilder.buildOr(I64, Halved, LowerBit);
+ auto HalvedFP = MIRBuilder.buildSITOFP(F32, RoundedHalved);
+ auto LargeResult = MIRBuilder.buildFAdd(F32, HalvedFP, HalvedFP);
// Check if the original value is larger than INT_MAX by comparing with
// zero to pick one of the two conversions.
auto IsLarge =
- MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, I1, Src, Zero);
MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);
MI.eraseFromParent();
@@ -7579,10 +7606,11 @@ LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
auto [Dst, Src] = MI.getFirst2Regs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
+ const LLT F64 = LLT::float64();
+ const LLT I64 = LLT::integer(64);
+ const LLT I32 = LLT::integer(32);
- assert(MRI.getType(Src).isScalar(64) && MRI.getType(Dst).isScalar(64));
+ assert(MRI.getType(Src).isInteger(64) && MRI.getType(Dst).isFloat(64));
// We create double value from 32 bit parts with 32 exponent difference.
// Note that + and - are float operations that adjust the implicit leading
@@ -7593,18 +7621,18 @@ LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
// Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
// = - 2^52 * 1.0...HighBits
// Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
- auto TwoP52 = MIRBuilder.buildConstant(S64, UINT64_C(0x4330000000000000));
- auto TwoP84 = MIRBuilder.buildConstant(S64, UINT64_C(0x4530000000000000));
+ auto TwoP52 = MIRBuilder.buildConstant(I64, UINT64_C(0x4330000000000000));
+ auto TwoP84 = MIRBuilder.buildConstant(I64, UINT64_C(0x4530000000000000));
auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
- auto TwoP52P84FP = MIRBuilder.buildFConstant(S64, TwoP52P84);
- auto HalfWidth = MIRBuilder.buildConstant(S64, 32);
-
- auto LowBits = MIRBuilder.buildTrunc(S32, Src);
- LowBits = MIRBuilder.buildZExt(S64, LowBits);
- auto LowBitsFP = MIRBuilder.buildOr(S64, TwoP52, LowBits);
- auto HighBits = MIRBuilder.buildLShr(S64, Src, HalfWidth);
- auto HighBitsFP = MIRBuilder.buildOr(S64, TwoP84, HighBits);
- auto Scratch = MIRBuilder.buildFSub(S64, HighBitsFP, TwoP52P84FP);
+ auto TwoP52P84FP = MIRBuilder.buildFConstant(F64, TwoP52P84);
+ auto HalfWidth = MIRBuilder.buildConstant(I64, 32);
+
+ auto LowBits = MIRBuilder.buildTrunc(I32, Src);
+ LowBits = MIRBuilder.buildZExt(I64, LowBits);
+ auto LowBitsFP = MIRBuilder.buildOr(I64, TwoP52, LowBits);
+ auto HighBits = MIRBuilder.buildLShr(I64, Src, HalfWidth);
+ auto HighBitsFP = MIRBuilder.buildOr(I64, TwoP84, HighBits);
+ auto Scratch = MIRBuilder.buildFSub(F64, HighBitsFP, TwoP52P84FP);
MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP);
MI.eraseFromParent();
@@ -7622,17 +7650,17 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
return Legalized;
}
- if (!SrcTy.isScalar(64))
+ if (!SrcTy.isInteger(64))
return UnableToLegalize;
- if (DstTy.isScalar(32))
+ if (DstTy.isFloat(32))
// TODO: SelectionDAG has several alternative expansions to port which may
// be more reasonable depending on the available instructions. We also need
// a more advanced mechanism to choose an optimal version depending on
// target features such as sitofp or CTLZ availability.
return lowerU64ToF32WithSITOFP(MI);
- if (DstTy.isScalar(64))
+ if (DstTy.isFloat(64))
return lowerU64ToF64BitFloatOps(MI);
return UnableToLegalize;
@@ -7641,11 +7669,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- const LLT S64 = LLT::scalar(64);
- const LLT S32 = LLT::scalar(32);
- const LLT S1 = LLT::scalar(1);
+ const LLT I64 = LLT::integer(64);
+ const LLT I32 = LLT::integer(32);
+ const LLT F32 = LLT::float32();
+ const LLT I1 = LLT::integer(1);
- if (SrcTy.isScalar(1)) {
+ if (SrcTy.isInteger(1)) {
auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
MIRBuilder.buildSelect(Dst, Src, True, False);
@@ -7653,26 +7682,26 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
return Legalized;
}
- if (!SrcTy.isScalar(64))
+ if (!SrcTy.isInteger(64))
return UnableToLegalize;
- if (DstTy.isScalar(32)) {
+ if (DstTy.isFloat(32)) {
// signed cl2f(long l) {
// long s = l >> 63;
// float r = cul2f((l + s) ^ s);
// return s ? -r : r;
// }
Register L = Src;
- auto SignBit = MIRBuilder.buildConstant(S64, 63);
- auto S = MIRBuilder.buildAShr(S64, L, SignBit);
+ auto SignBit = MIRBuilder.buildConstant(I64, 63);
+ auto S = MIRBuilder.buildAShr(I64, L, SignBit);
- auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
- auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
- auto R = MIRBuilder.buildUITOFP(S32, Xor);
+ auto LPlusS = MIRBuilder.buildAdd(I64, L, S);
+ auto Xor = MIRBuilder.buildXor(I64, LPlusS, S);
+ auto R = MIRBuilder.buildUITOFP(F32, Xor);
- auto RNeg = MIRBuilder.buildFNeg(S32, R);
- auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
- MIRBuilder.buildConstant(S64, 0));
+ auto RNeg = MIRBuilder.buildFNeg(F32, R);
+ auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, I1, S,
+ MIRBuilder.buildConstant(I64, 0));
MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
MI.eraseFromParent();
return Legalized;
@@ -7684,9 +7713,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
- if (!SrcTy.isScalar(64) && !SrcTy.isScalar(32))
+ if (!SrcTy.isFloat(64) && !SrcTy.isFloat(32))
return UnableToLegalize;
- if (!DstTy.isScalar(32) && !DstTy.isScalar(64))
+ if (!DstTy.isInteger(32) && !DstTy.isInteger(64))
return UnableToLegalize;
// FPTOSI gives same result as FPTOUI for positive signed integers.
@@ -7709,10 +7738,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
- const LLT S1 = LLT::scalar(1);
+ const LLT I1 = LLT::integer(1);
MachineInstrBuilder FCMP =
- MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
+ MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, I1, Src, Threshold);
MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
MI.eraseFromParent();
@@ -7723,7 +7752,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
// FIXME: Only f32 to i64 conversions are supported.
- if (!SrcTy.getScalarType().isScalar(32) || !DstTy.getScalarType().isScalar(64))
+ if (!SrcTy.getScalarType().isFloat(32) || !DstTy.getScalarType().isInteger(64))
return UnableToLegalize;
// Expand f32 -> i64 conversion
@@ -7760,9 +7789,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
- const LLT S1 = LLT::scalar(1);
+ const LLT I1 = LLT::integer(1);
auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
- S1, Exponent, ExponentLoBit);
+ I1, Exponent, ExponentLoBit);
R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
@@ -7772,7 +7801,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
- S1, Exponent, ZeroSrcTy);
+ I1, Exponent, ZeroSrcTy);
auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
@@ -7884,19 +7913,22 @@ LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
- const LLT S1 = LLT::scalar(1);
- const LLT S32 = LLT::scalar(32);
+ const LLT I1 = LLT::integer(1);
+ const LLT F32 = LLT::float32();
+ const LLT I16 = LLT::integer(16);
+ const LLT I32 = LLT::integer(32);
+ const LLT I64 = LLT::integer(64);
auto [Dst, Src] = MI.getFirst2Regs();
- assert(MRI.getType(Dst).getScalarType().isScalar(16) &&
- MRI.getType(Src).getScalarType().isScalar(64));
+ assert(MRI.getType(Dst).getScalarType().isFloat(16) &&
+ MRI.getType(Src).getScalarType().isFloat(64));
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
unsigned Flags = MI.getFlags();
- auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
+ auto Src32 = MIRBuilder.buildFPTrunc(F32, Src, Flags);
MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
MI.eraseFromParent();
return Legalized;
@@ -7906,93 +7938,94 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
const unsigned ExpBiasf64 = 1023;
const unsigned ExpBiasf16 = 15;
- auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
+ auto Unmerge = MIRBuilder.buildUnmerge(I32, MIRBuilder.buildBitcast(I64, Src));
Register U = Unmerge.getReg(0);
Register UH = Unmerge.getReg(1);
- auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
- E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
+ auto E = MIRBuilder.buildLShr(I32, UH, MIRBuilder.buildConstant(I32, 20));
+ E = MIRBuilder.buildAnd(I32, E, MIRBuilder.buildConstant(I32, ExpMask));
// Subtract the fp64 exponent bias (1023) to get the real exponent and
// add the f16 bias (15) to get the biased exponent for the f16 format.
E = MIRBuilder.buildAdd(
- S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
+ I32, E, MIRBuilder.buildConstant(I32, -ExpBiasf64 + ExpBiasf16));
- auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
- M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
+ auto M = MIRBuilder.buildLShr(I32, UH, MIRBuilder.buildConstant(I32, 8));
+ M = MIRBuilder.buildAnd(I32, M, MIRBuilder.buildConstant(I32, 0xffe));
- auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
- MIRBuilder.buildConstant(S32, 0x1ff));
- MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
+ auto MaskedSig = MIRBuilder.buildAnd(I32, UH,
+ MIRBuilder.buildConstant(I32, 0x1ff));
+ MaskedSig = MIRBuilder.buildOr(I32, MaskedSig, U);
- auto Zero = MIRBuilder.buildConstant(S32, 0);
- auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
- auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
- M = MIRBuilder.buildOr(S32, M, Lo40Set);
+ auto Zero = MIRBuilder.buildConstant(I32, 0);
+ auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, I1, MaskedSig, Zero);
+ auto Lo40Set = MIRBuilder.buildZExt(I32, SigCmpNE0);
+ M = MIRBuilder.buildOr(I32, M, Lo40Set);
// (M != 0 ? 0x0200 : 0) | 0x7c00;
- auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
- auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
- auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
+ auto Bits0x200 = MIRBuilder.buildConstant(I32, 0x0200);
+ auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, I1, M, Zero);
+ auto SelectCC = MIRBuilder.buildSelect(I32, CmpM_NE0, Bits0x200, Zero);
- auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
- auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
+ auto Bits0x7c00 = MIRBuilder.buildConstant(I32, 0x7c00);
+ auto I = MIRBuilder.buildOr(I32, SelectCC, Bits0x7c00);
// N = M | (E << 12);
- auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
- auto N = MIRBuilder.buildOr(S32, M, EShl12);
+ auto EShl12 = MIRBuilder.buildShl(I32, E, MIRBuilder.buildConstant(I32, 12));
+ auto N = MIRBuilder.buildOr(I32, M, EShl12);
// B = clamp(1-E, 0, 13);
- auto One = MIRBuilder.buildConstant(S32, 1);
- auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
- auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
- B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
+ auto One = MIRBuilder.buildConstant(I32, 1);
+ auto OneSubExp = MIRBuilder.buildSub(I32, One, E);
+ auto B = MIRBuilder.buildSMax(I32, OneSubExp, Zero);
+ B = MIRBuilder.buildSMin(I32, B, MIRBuilder.buildConstant(I32, 13));
- auto SigSetHigh = MIRBuilder.buildOr(S32, M,
- MIRBuilder.buildConstant(S32, 0x1000));
+ auto SigSetHigh = MIRBuilder.buildOr(I32, M,
+ MIRBuilder.buildConstant(I32, 0x1000));
- auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
- auto D0 = MIRBuilder.buildShl(S32, D, B);
+ auto D = MIRBuilder.buildLShr(I32, SigSetHigh, B);
+ auto D0 = MIRBuilder.buildShl(I32, D, B);
- auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
+ auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, I1,
D0, SigSetHigh);
- auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
- D = MIRBuilder.buildOr(S32, D, D1);
+ auto D1 = MIRBuilder.buildZExt(I32, D0_NE_SigSetHigh);
+ D = MIRBuilder.buildOr(I32, D, D1);
- auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
- auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
+ auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, I1, E, One);
+ auto V = MIRBuilder.buildSelect(I32, CmpELtOne, D, N);
- auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
- V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
+ auto VLow3 = MIRBuilder.buildAnd(I32, V, MIRBuilder.buildConstant(I32, 7));
+ V = MIRBuilder.buildLShr(I32, V, MIRBuilder.buildConstant(I32, 2));
- auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
- MIRBuilder.buildConstant(S32, 3));
- auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
+ auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, I1, VLow3,
+ MIRBuilder.buildConstant(I32, 3));
+ auto V0 = MIRBuilder.buildZExt(I32, VLow3Eq3);
- auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
- MIRBuilder.buildConstant(S32, 5));
- auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
+ auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, I1, VLow3,
+ MIRBuilder.buildConstant(I32, 5));
+ auto V1 = MIRBuilder.buildZExt(I32, VLow3Gt5);
- V1 = MIRBuilder.buildOr(S32, V0, V1);
- V = MIRBuilder.buildAdd(S32, V, V1);
+ V1 = MIRBuilder.buildOr(I32, V0, V1);
+ V = MIRBuilder.buildAdd(I32, V, V1);
- auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
- E, MIRBuilder.buildConstant(S32, 30));
- V = MIRBuilder.buildSelect(S32, CmpEGt30,
- MIRBuilder.buildConstant(S32, 0x7c00), V);
+ auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, I1,
+ E, MIRBuilder.buildConstant(I32, 30));
+ V = MIRBuilder.buildSelect(I32, CmpEGt30,
+ MIRBuilder.buildConstant(I32, 0x7c00), V);
- auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
- E, MIRBuilder.buildConstant(S32, 1039));
- V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
+ auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, I1,
+ E, MIRBuilder.buildConstant(I32, 1039));
+ V = MIRBuilder.buildSelect(I32, CmpEGt1039, I, V);
// Extract the sign bit.
- auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
- Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
+ auto Sign = MIRBuilder.buildLShr(I32, UH, MIRBuilder.buildConstant(I32, 16));
+ Sign = MIRBuilder.buildAnd(I32, Sign, MIRBuilder.buildConstant(I32, 0x8000));
// Insert the sign bit
- V = MIRBuilder.buildOr(S32, Sign, V);
+ V = MIRBuilder.buildOr(I32, Sign, V);
- MIRBuilder.buildTrunc(Dst, V);
+ auto Trunc = MIRBuilder.buildTrunc(I16, V);
+ MIRBuilder.buildBitcast(Dst, Trunc);
MI.eraseFromParent();
return Legalized;
}
@@ -8001,7 +8034,7 @@ LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
auto [DstTy, SrcTy] = MI.getFirst2LLTs();
- if (DstTy.getScalarType().isScalar(16) && SrcTy.getScalarType().isScalar(64))
+ if (DstTy.getScalarType().isFloat(16) && SrcTy.getScalarType().isFloat(64))
return lowerFPTRUNC_F64_TO_F16(MI);
return UnableToLegalize;
@@ -8241,7 +8274,7 @@ LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
unsigned PartSize = Src0Ty.getSizeInBits();
- LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
+ LLT WideTy = LLT::integer(DstTy.getSizeInBits());
Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
for (unsigned I = 2; I != NumOps; ++I) {
@@ -8389,7 +8422,7 @@ LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
MI.getFirst3RegLLTs();
- LLT IdxTy = LLT::scalar(32);
+ LLT IdxTy = LLT::integer(32);
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
Register Undef;
@@ -8441,7 +8474,7 @@ LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
MachinePointerInfo ValPtrInfo =
MachinePointerInfo::getUnknownStack(*MI.getMF());
- LLT IdxTy = LLT::scalar(32);
+ LLT IdxTy = LLT::integer(32);
LLT ValTy = VecTy.getElementType();
Align ValAlign = getStackTemporaryAlignment(ValTy);
@@ -8463,7 +8496,7 @@ LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
} else if (HasPassthru) {
auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask);
Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD,
- {LLT::scalar(32)}, {Popcount});
+ {LLT::integer(32)}, {Popcount});
Register LastElmtPtr =
getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0));
@@ -8483,7 +8516,7 @@ LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
LLT MaskITy = MaskTy.getElementType();
auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx);
if (MaskITy.getSizeInBits() > 1)
- MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI);
+ MaskI = MIRBuilder.buildTrunc(LLT::integer(1), MaskI);
MaskI = MIRBuilder.buildZExt(IdxTy, MaskI);
OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI);
@@ -8492,7 +8525,7 @@ LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
auto EndOfVector =
MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1);
auto AllLanesSelected = MIRBuilder.buildICmp(
- CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector);
+ CmpInst::ICMP_UGT, LLT::integer(1), OutPos, EndOfVector);
OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy},
{OutPos, EndOfVector});
ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));
@@ -8515,7 +8548,7 @@ Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
Register AllocSize,
Align Alignment,
LLT PtrTy) {
- LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+ LLT IntPtrTy = LLT::integer(PtrTy.getSizeInBits());
auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
@@ -8614,8 +8647,8 @@ LegalizerHelper::lowerExtract(MachineInstr &MI) {
(SrcTy.isScalar() ||
(SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
LLT SrcIntTy = SrcTy;
- if (!SrcTy.isScalar()) {
- SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
+ if (!SrcTy.isInteger()) {
+ SrcIntTy = LLT::integer(SrcTy.getSizeInBits());
SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
}
@@ -8695,13 +8728,13 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
LLT IntDstTy = DstTy;
- if (!DstTy.isScalar()) {
- IntDstTy = LLT::scalar(DstTy.getSizeInBits());
+ if (!DstTy.isInteger()) {
+ IntDstTy = LLT::integer(DstTy.getSizeInBits());
Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
}
- if (!InsertTy.isScalar()) {
- const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
+ if (!InsertTy.isInteger()) {
+ const LLT IntInsertTy = LLT::integer(InsertTy.getSizeInBits());
InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
}
@@ -9103,7 +9136,7 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
unsigned BitSize = SrcTy.getScalarSizeInBits();
const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
- LLT IntTy = LLT::scalar(BitSize);
+ LLT IntTy = LLT::integer(BitSize);
if (SrcTy.isVector())
IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);
@@ -9262,7 +9295,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
bool IsEltPtr = DstTy.isPointerOrPointerVector();
if (IsEltPtr) {
- LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
+ LLT ScalarPtrTy = LLT::integer(DstTy.getScalarSizeInBits());
LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
@@ -9365,7 +9398,7 @@ LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
Register SrcReg = MI.getOperand(1).getReg();
Register DestReg = MI.getOperand(0).getReg();
- LLT Ty = MRI.getType(SrcReg), IType = LLT::scalar(1);
+ LLT Ty = MRI.getType(SrcReg), IType = LLT::integer(1);
auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
auto ICmp = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, IType, SrcReg, Zero);
@@ -9379,12 +9412,13 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(DstReg);
+ LLT IntTy = Ty.dropType();
// Reset sign bit
- MIRBuilder.buildAnd(
- DstReg, SrcReg,
- MIRBuilder.buildConstant(
- Ty, APInt::getSignedMaxValue(Ty.getScalarSizeInBits())));
+ auto Bitcast = MIRBuilder.buildBitcast(IntTy, SrcReg);
+ auto SignMax = MIRBuilder.buildConstant(IntTy, APInt::getSignedMaxValue(Ty.getScalarSizeInBits()));
+ auto And = MIRBuilder.buildAnd(IntTy, Bitcast, SignMax);
+ MIRBuilder.buildBitcast(DstReg, And);
MI.eraseFromParent();
return Legalized;
@@ -9424,7 +9458,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);
const Align A(MI.getOperand(2).getImm());
- LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
+ LLT PtrTyAsScalarTy = LLT::integer(PtrTy.getSizeInBits());
if (A > TLI.getMinStackArgumentAlignment()) {
Register AlignAmt =
MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
@@ -9481,11 +9515,11 @@ static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
// Use the largest scalar type whose alignment constraints are satisfied.
// We only need to check DstAlign here as SrcAlign is always greater or
// equal to DstAlign (or zero).
- Ty = LLT::scalar(64);
+ Ty = LLT::integer(64);
if (Op.isFixedDstAlign())
while (Op.getDstAlign() < Ty.getSizeInBytes() &&
!TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
- Ty = LLT::scalar(Ty.getSizeInBytes());
+ Ty = LLT::integer(Ty.getSizeInBytes());
assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
// FIXME: check for the largest legal type we can load/store to.
}
@@ -9500,8 +9534,8 @@ static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
// FIXME: check for mem op safety and legality of the types. Not all of
// SDAGisms map cleanly to GISel concepts.
if (NewTy.isVector())
- NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
- NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
+ NewTy = NewTy.getSizeInBits() > 64 ? LLT::integer(64) : LLT::integer(32);
+ NewTy = LLT::integer(llvm::bit_floor(NewTy.getSizeInBits() - 1));
unsigned NewTySize = NewTy.getSizeInBytes();
assert(NewTySize > 0 && "Could not find appropriate type");
@@ -9665,7 +9699,7 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
Register Ptr = Dst;
if (DstOff != 0) {
auto Offset =
- MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
+ MIB.buildConstant(LLT::integer(PtrTy.getSizeInBits()), DstOff);
Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
}
@@ -9804,7 +9838,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
Register Offset;
if (CurrOffset != 0) {
LLT SrcTy = MRI.getType(Src);
- Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
+ Offset = MIB.buildConstant(LLT::integer(SrcTy.getSizeInBits()), CurrOffset)
.getReg(0);
LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
}
@@ -9903,7 +9937,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
if (CurrOffset != 0) {
LLT SrcTy = MRI.getType(Src);
auto Offset =
- MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
+ MIB.buildConstant(LLT::integer(SrcTy.getSizeInBits()), CurrOffset);
LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
}
LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
@@ -9921,7 +9955,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
if (CurrOffset != 0) {
LLT DstTy = MRI.getType(Dst);
auto Offset =
- MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
+ MIB.buildConstant(LLT::integer(DstTy.getSizeInBits()), CurrOffset);
StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
}
MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
More information about the llvm-commits
mailing list