[llvm] [AMDGPU][True16][CodeGen] uaddsat and usubsat selection in true16 flow globalISel (PR #118868)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 5 13:16:15 PST 2024
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/118868
From f0e9e82ed3e0c07790df9f8af4ea71dd780314cd Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 5 Dec 2024 15:43:58 -0500
Subject: [PATCH 1/2] Add VGPR_16 to the GISel register bank and support
 G_BUILD_VECTOR selection in true16 GISel

With real true16 instructions, 16-bit values live in VGPR_16 registers. This
patch adds VGPR_16 and VGPR_16_Lo128 to the VGPR register bank, treats 16-bit
scalars as legal register types in the legalizer when real true16 instructions
are enabled, selects a G_BUILD_VECTOR of two 16-bit values into a REG_SEQUENCE
over the lo16/hi16 subregisters, and extends RegSplitParts to 16-bit
granularity.
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 15 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 55 +-
llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 2 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 20 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 18 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +
.../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 555 +++++++++++++-----
.../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 555 +++++++++++++-----
9 files changed, 861 insertions(+), 365 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3be865f03df1fd..707ad0caa83042 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -700,9 +700,22 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
return true;
// TODO: This should probably be a combine somewhere
- // (build_vector $src0, undef) -> copy $src0
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+ if (Subtarget->useRealTrue16Insts() && IsVector) {
+ // (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
+ // -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
+ .addReg(Undef)
+ .addReg(Src0)
+ .addImm(AMDGPU::lo16);
+ MI.eraseFromParent();
+ return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
+ RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
+ }
+ // (build_vector $src0, undef) -> copy $src0
MI.setDesc(TII.get(AMDGPU::COPY));
MI.removeOperand(2);
const auto &RC =
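
[Editor's note: a minimal standalone C++ sketch, not the LLVM API, of the
register view this selection relies on. A 32-bit VGPR is addressable as two
16-bit subregisters, lo16 and hi16; for (build_vector $src0, undef) only lo16
needs a defined value, which is why the selector emits an IMPLICIT_DEF of the
full 32-bit register followed by an INSERT_SUBREG of the VGPR_16 source into
lo16. Names here are illustrative only.]

#include <cstdint>
#include <cstdio>

// Model of a 32-bit VGPR addressable as lo16/hi16 subregisters.
struct Vgpr32 {
  uint32_t Bits = 0; // IMPLICIT_DEF: contents start undefined (0 stands in)
  void insertLo16(uint16_t V) { Bits = (Bits & 0xffff0000u) | V; }
  uint16_t lo16() const { return uint16_t(Bits); }
  uint16_t hi16() const { return uint16_t(Bits >> 16); }
};

int main() {
  Vgpr32 Dst;             // %dst = IMPLICIT_DEF
  Dst.insertLo16(0x1234); // %dst = INSERT_SUBREG %dst, %src0, lo16
  std::printf("lo16=0x%04x hi16=0x%04x\n", (unsigned)Dst.lo16(),
              (unsigned)Dst.hi16()); // hi16 is never written: it stays undef
  return 0;
}
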
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 2e66f7525b9ccf..7f91dd673d6c76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -223,8 +223,9 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
};
}
-static bool isRegisterSize(unsigned Size) {
- return Size % 32 == 0 && Size <= MaxRegisterSize;
+static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
+ return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
+ Size <= MaxRegisterSize;
}
static bool isRegisterVectorElementType(LLT EltTy) {
@@ -240,8 +241,8 @@ static bool isRegisterVectorType(LLT Ty) {
}
// TODO: replace all uses of isRegisterType with isRegisterClassType
-static bool isRegisterType(LLT Ty) {
- if (!isRegisterSize(Ty.getSizeInBits()))
+static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
+ if (!isRegisterSize(ST, Ty.getSizeInBits()))
return false;
if (Ty.isVector())
@@ -252,19 +253,19 @@ static bool isRegisterType(LLT Ty) {
// Any combination of 32 or 64-bit elements up the maximum register size, and
// multiples of v2s16.
-static LegalityPredicate isRegisterType(unsigned TypeIdx) {
- return [=](const LegalityQuery &Query) {
- return isRegisterType(Query.Types[TypeIdx]);
+static LegalityPredicate isRegisterType(const GCNSubtarget &ST, unsigned TypeIdx) {
+ return [=, &ST](const LegalityQuery &Query) {
+ return isRegisterType(ST, Query.Types[TypeIdx]);
};
}
// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
-static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
- return [=](const LegalityQuery &Query) {
+static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx) {
+ return [=, &ST](const LegalityQuery &Query) {
LLT Ty = Query.Types[TypeIdx];
- return isRegisterType(Ty) &&
+ return isRegisterType(ST, Ty) &&
!SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
};
}
@@ -348,17 +349,19 @@ static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
V6S64, V7S64, V8S64, V16S64};
// Checks whether a type is in the list of legal register types.
-static bool isRegisterClassType(LLT Ty) {
+static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
if (Ty.isPointerOrPointerVector())
Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
- is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
+ is_contained(AllScalarTypes, Ty) ||
+ (ST.useRealTrue16Insts() && Ty == S16) ||
+ is_contained(AllS16Vectors, Ty);
}
-static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
- return [TypeIdx](const LegalityQuery &Query) {
- return isRegisterClassType(Query.Types[TypeIdx]);
+static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST, unsigned TypeIdx) {
+ return [&ST, TypeIdx](const LegalityQuery &Query) {
+ return isRegisterClassType(ST, Query.Types[TypeIdx]);
};
}
@@ -510,7 +513,7 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
- return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
+ return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
!hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}
@@ -523,12 +526,12 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
if (Size != MemSizeInBits)
return Size <= 32 && Ty.isVector();
- if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+ if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
return true;
// Don't try to handle bitcasting vector ext loads for now.
return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
- (Size <= 32 || isRegisterSize(Size)) &&
+ (Size <= 32 || isRegisterSize(ST, Size)) &&
!isRegisterVectorElementType(Ty.getElementType());
}
@@ -875,7 +878,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_BITCAST)
// Don't worry about the size constraint.
- .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
+ .legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1)))
.lower();
getActionDefinitionsBuilder(G_CONSTANT)
@@ -890,7 +893,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S16, S64);
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
- .legalIf(isRegisterClassType(0))
+ .legalIf(isRegisterClassType(ST, 0))
// s1 and s16 are special cases because they have legal operations on
// them, but don't really occupy registers in the normal way.
.legalFor({S1, S16})
@@ -1825,7 +1828,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampMaxNumElements(VecTypeIdx, S32, 32)
// TODO: Clamp elements for 64-bit vectors?
.moreElementsIf(
- isIllegalRegisterType(VecTypeIdx),
+ isIllegalRegisterType(ST, VecTypeIdx),
moreElementsToNextExistingRegClass(VecTypeIdx))
// It should only be necessary with variable indexes.
// As a last resort, lower to the stack
@@ -1883,7 +1886,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampNumElements(0, V2S64, V16S64)
.fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
.moreElementsIf(
- isIllegalRegisterType(0),
+ isIllegalRegisterType(ST, 0),
moreElementsToNextExistingRegClass(0));
if (ST.hasScalarPackInsts()) {
@@ -1904,11 +1907,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
}
- BuildVector.legalIf(isRegisterType(0));
+ BuildVector.legalIf(isRegisterType(ST, 0));
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalIf(all(isRegisterType(0), isRegisterType(1)))
+ .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
.clampMaxNumElements(0, S32, 32)
.clampMaxNumElements(1, S16, 2) // TODO: Make 4?
.clampMaxNumElements(0, S16, 64);
@@ -1933,7 +1936,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
auto &Builder = getActionDefinitionsBuilder(Op)
- .legalIf(all(isRegisterType(0), isRegisterType(1)))
+ .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
.lowerFor({{S16, V2S16}})
.lowerIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
@@ -3149,7 +3152,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
} else {
// Extract the subvector.
- if (isRegisterType(ValTy)) {
+ if (isRegisterType(ST, ValTy)) {
// If this a case where G_EXTRACT is legal, use it.
// (e.g. <3 x s32> -> <4 x s32>)
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
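
[Editor's note: a minimal standalone sketch, with hypothetical stand-in types
rather than the LLVM LegalityPredicate machinery, of the predicate change
above: the register-size check now depends on the subtarget, so the lambda
captures ST by reference and admits a bare 16-bit scalar only when real
true16 instructions are in use.]

#include <cstdio>
#include <functional>

// Hypothetical stand-ins for GCNSubtarget and the 1024-bit register cap.
struct Subtarget {
  bool RealTrue16;
  bool useRealTrue16Insts() const { return RealTrue16; }
};
constexpr unsigned MaxRegisterSize = 1024;

// Mirrors the patched isRegisterSize(): a bare 16-bit type is a register
// size only when real true16 instructions are available.
static bool isRegisterSize(const Subtarget &ST, unsigned Size) {
  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
         Size <= MaxRegisterSize;
}

// Predicate factory: capture the subtarget by reference, as the patch does
// with [=, &ST], so every legality query consults the same subtarget.
static std::function<bool(unsigned)> isRegisterSizePred(const Subtarget &ST) {
  return [&ST](unsigned Size) { return isRegisterSize(ST, Size); };
}

int main() {
  Subtarget True16{true}, Fake16{false};
  auto P = isRegisterSizePred(True16);
  auto Q = isRegisterSizePred(Fake16);
  std::printf("s16 real-true16: %d, s16 fake16: %d, s32: %d\n",
              (int)P(16), (int)Q(16), (int)P(32)); // prints 1, 0, 1
  return 0;
}
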
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 2d8dc9d47225ee..1c1a6dac75a17b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
+ [VGPR_16_Lo128, VGPR_16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bc25d75131cc35..c822a3ea0d8667 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3275,6 +3275,8 @@ def : GCNPat <
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
(COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
@@ -3284,6 +3286,7 @@ def : GCNPat <
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
+}
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
@@ -3293,6 +3296,8 @@ def : GCNPat <
}
let SubtargetPredicate = HasVOP3PInsts in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
@@ -3322,12 +3327,25 @@ def : GCNPat <
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
+ (REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
+>;
+// GISel ignores this Pat, but the equivalent is done in selectG_BUILD_VECTOR
+def : GCNPat <
+ (vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+>;
+}
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
@@ -3353,6 +3371,8 @@ def : GCNPat <
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
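
[Editor's note: a rough standalone illustration of what the new true16
pattern computes, assuming the usual element order for v2i16 (element 0 in
lo16, element 1 in hi16). The REG_SEQUENCE simply places the two 16-bit
sources into the two halves of one 32-bit VGPR, so no V_PERM_B32 or
shift/or sequence is needed when the sources already live in VGPR_16.]

#include <cstdint>
#include <cstdio>

// (v2i16 build_vector $a, $b) packed as one 32-bit VGPR:
// $a -> lo16 (element 0), $b -> hi16 (element 1).
static uint32_t buildVectorV2I16(uint16_t A, uint16_t B) {
  return uint32_t(A) | (uint32_t(B) << 16);
}

int main() {
  std::printf("0x%08x\n", buildVectorV2I16(0x1111, 0x2222)); // 0x22221111
  return 0;
}
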
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 049f4af4dd2f93..86a1b74e256322 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -35,7 +35,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
+std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
@@ -343,9 +343,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
static auto InitializeRegSplitPartsOnce = [this]() {
for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
unsigned Size = getSubRegIdxSize(Idx);
- if (Size & 31)
+ if (Size & 15)
continue;
- std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
+ std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
unsigned Pos = getSubRegIdxOffset(Idx);
if (Pos % Size)
continue;
@@ -3561,14 +3561,14 @@ bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
- assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
+ assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
- const unsigned RegDWORDs = RegBitWidth / 32;
- const unsigned EltDWORDs = EltSize / 4;
- assert(RegSplitParts.size() + 1 >= EltDWORDs);
+ const unsigned RegHalves = RegBitWidth / 16;
+ const unsigned EltHalves = EltSize / 2;
+ assert(RegSplitParts.size() + 1 >= EltHalves);
- const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
- const unsigned NumParts = RegDWORDs / EltDWORDs;
+ const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
+ const unsigned NumParts = RegHalves / EltHalves;
return ArrayRef(Parts.data(), NumParts);
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 8e481e3ac23043..60ae9948cbc699 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -37,11 +37,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
BitVector RegPressureIgnoredUnits;
/// Sub reg indexes for getRegSplitParts.
- /// First index represents subreg size from 1 to 16 DWORDs.
+  /// First index represents subreg size from 1 to 32 half-DWORDs (16 bits each).
/// The inner vector is sorted by bit offset.
/// Provided a register can be fully split with given subregs,
/// all elements of the inner vector combined give a full lane mask.
- static std::array<std::vector<int16_t>, 16> RegSplitParts;
+ static std::array<std::vector<int16_t>, 32> RegSplitParts;
// Table representing sub reg of given width and offset.
// First index is subreg size: 32, 64, 96, 128, 160, 192, 224, 256, 512.
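
[Editor's note: a standalone sketch, illustrative only, of the
getRegSplitParts re-indexing above. The table used to be keyed by element
size in DWORDs with 16 slots; with 16-bit subregisters it is keyed by
element size in 16-bit halves with 32 slots. EltSize is in bytes and
RegBitWidth in bits, matching the assert in the patch.]

#include <cassert>
#include <cstdio>

struct SplitInfo {
  unsigned TableIndex; // index into RegSplitParts
  unsigned NumParts;   // how many subregs the register splits into
};

// New 16-bit-granular indexing from the patch; EltSize is in bytes.
static SplitInfo splitParts(unsigned RegBitWidth, unsigned EltSize) {
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
  const unsigned RegHalves = RegBitWidth / 16; // was RegDWORDs = RegBitWidth / 32
  const unsigned EltHalves = EltSize / 2;      // was EltDWORDs = EltSize / 4
  return {EltHalves - 1, RegHalves / EltHalves};
}

int main() {
  SplitInfo A = splitParts(64, 2);  // split a 64-bit reg into 16-bit parts
  SplitInfo B = splitParts(128, 4); // split a 128-bit reg into 32-bit parts
  std::printf("64b/16b: idx=%u parts=%u  128b/32b: idx=%u parts=%u\n",
              A.TableIndex, A.NumParts, B.TableIndex, B.NumParts);
  return 0;
}
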
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 70deaf82dfaf34..e1e8880e2138d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2427,6 +2427,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
+ case AMDGPU::VGPR_16RegClassID:
+ case AMDGPU::VGPR_16_Lo128RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
case AMDGPU::AGPR_LO16RegClassID:
return 16;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 788692c94b0cfa..79dedf34845fdf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_uaddsat_i7:
@@ -35,14 +36,32 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_uaddsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uaddsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 9, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
@@ -78,14 +97,32 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_uaddsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_uaddsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 9
+; GFX10-NEXT: s_lshl_b32 s1, s1, 9
+; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_uaddsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
@@ -120,14 +157,32 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_uaddsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uaddsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -163,14 +218,32 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_uaddsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_uaddsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 8
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_uaddsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -246,25 +319,40 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_uaddsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uaddsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -356,29 +444,50 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_uaddsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s1, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: v_pk_add_u16 v0, s0, s1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_uaddsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -520,36 +629,69 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_uaddsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
-; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, v3 clamp
-; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uaddsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v2, v1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v7, v1, 16
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, v3 clamp
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v1, v3, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -723,46 +865,89 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_uaddsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s1, 24
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
-; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
-; GFX11-NEXT: s_lshr_b32 s4, s2, 16
-; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
-; GFX11-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
-; GFX11-NEXT: s_lshl_b32 s5, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-NEXT: v_pk_add_u16 v0, s2, s3 clamp
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_u16 v1, s0, s1 clamp
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_uaddsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -1755,11 +1940,23 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_uaddsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uaddsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l clamp
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1789,11 +1986,23 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_uaddsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_uaddsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_uaddsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1819,10 +2028,20 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: uaddsat_i16_sv:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, v0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: uaddsat_i16_sv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u16 v0, s0, v0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: uaddsat_i16_sv:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, v0.l clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: uaddsat_i16_sv:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, v0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
@@ -1849,10 +2068,20 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_add_u16_e64 v0, v0, s0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: uaddsat_i16_vs:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, s0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: uaddsat_i16_vs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u16 v0, v0, s0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: uaddsat_i16_vs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, s0 clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: uaddsat_i16_vs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, s0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0042d34e235d17..a8e34d08618d88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_usubsat_i7:
@@ -34,14 +35,32 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_usubsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 9, v1.l
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
@@ -76,14 +95,32 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_usubsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_usubsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 9
+; GFX10-NEXT: s_lshl_b32 s1, s1, 9
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_usubsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
@@ -117,14 +154,32 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_usubsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -159,14 +214,32 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_usubsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_usubsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 8
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_usubsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -240,25 +313,40 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_usubsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_usubsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -348,29 +436,50 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_usubsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s1, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_usubsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -508,36 +617,69 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_usubsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
-; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
-; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_usubsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, v2, v1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v7, v1, 16
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v1, v3, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -707,46 +849,89 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_usubsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s1, 24
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
-; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
-; GFX11-NEXT: s_lshr_b32 s4, s2, 16
-; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
-; GFX11-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
-; GFX11-NEXT: s_lshl_b32 s5, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_usubsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -1672,11 +1857,23 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_usubsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l clamp
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1705,11 +1902,23 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_usubsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_usubsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_usubsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1734,10 +1943,20 @@ define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: usubsat_i16_sv:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: usubsat_i16_sv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: usubsat_i16_sv:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, v0.l clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: usubsat_i16_sv:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
@@ -1763,10 +1982,20 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: usubsat_i16_vs:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: usubsat_i16_vs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: usubsat_i16_vs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, s0 clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: usubsat_i16_vs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
>From d0d75a996169c292d15d0967edea7116330389c7 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 5 Dec 2024 16:15:52 -0500
Subject: [PATCH 2/2] run clang-format
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 162 ++++++++++--------
1 file changed, 86 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 7f91dd673d6c76..3af02a89ee6b51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -253,7 +253,8 @@ static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
// Any combination of 32 or 64-bit elements up the maximum register size, and
// multiples of v2s16.
-static LegalityPredicate isRegisterType(const GCNSubtarget &ST, unsigned TypeIdx) {
+static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
+ unsigned TypeIdx) {
return [=, &ST](const LegalityQuery &Query) {
return isRegisterType(ST, Query.Types[TypeIdx]);
};
@@ -262,7 +263,8 @@ static LegalityPredicate isRegisterType(const GCNSubtarget &ST, unsigned TypeIdx
// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
-static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx) {
+static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
+ unsigned TypeIdx) {
return [=, &ST](const LegalityQuery &Query) {
LLT Ty = Query.Types[TypeIdx];
return isRegisterType(ST, Ty) &&
@@ -356,10 +358,11 @@ static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
is_contained(AllScalarTypes, Ty) ||
(ST.useRealTrue16Insts() && Ty == S16) ||
- is_contained(AllS16Vectors, Ty);
+ is_contained(AllS16Vectors, Ty);
}
-static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST, unsigned TypeIdx) {
+static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
+ unsigned TypeIdx) {
return [&ST, TypeIdx](const LegalityQuery &Query) {
return isRegisterClassType(ST, Query.Types[TypeIdx]);
};
@@ -1782,7 +1785,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
unsigned IdxTypeIdx = 2;
getActionDefinitionsBuilder(Op)
- .customIf([=](const LegalityQuery &Query) {
+ .customIf([=](const LegalityQuery &Query) {
const LLT EltTy = Query.Types[EltTypeIdx];
const LLT VecTy = Query.Types[VecTypeIdx];
const LLT IdxTy = Query.Types[IdxTypeIdx];
@@ -1803,36 +1806,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
IdxTy.getSizeInBits() == 32 &&
isLegalVecType;
})
- .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
- bitcastToVectorElement32(VecTypeIdx))
- //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
- .bitcastIf(
- all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
- [=](const LegalityQuery &Query) {
- // For > 64-bit element types, try to turn this into a 64-bit
- // element vector since we may be able to do better indexing
- // if this is scalar. If not, fall back to 32.
- const LLT EltTy = Query.Types[EltTypeIdx];
- const LLT VecTy = Query.Types[VecTypeIdx];
- const unsigned DstEltSize = EltTy.getSizeInBits();
- const unsigned VecSize = VecTy.getSizeInBits();
-
- const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
- return std::pair(
- VecTypeIdx,
- LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
- })
- .clampScalar(EltTypeIdx, S32, S64)
- .clampScalar(VecTypeIdx, S32, S64)
- .clampScalar(IdxTypeIdx, S32, S32)
- .clampMaxNumElements(VecTypeIdx, S32, 32)
- // TODO: Clamp elements for 64-bit vectors?
- .moreElementsIf(
- isIllegalRegisterType(ST, VecTypeIdx),
- moreElementsToNextExistingRegClass(VecTypeIdx))
- // It should only be necessary with variable indexes.
- // As a last resort, lower to the stack
- .lower();
+ .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+ scalarOrEltNarrowerThan(VecTypeIdx, 32)),
+ bitcastToVectorElement32(VecTypeIdx))
+ //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+ .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+ scalarOrEltWiderThan(VecTypeIdx, 64)),
+ [=](const LegalityQuery &Query) {
+ // For > 64-bit element types, try to turn this into a
+ // 64-bit element vector since we may be able to do better
+ // indexing if this is scalar. If not, fall back to 32.
+ const LLT EltTy = Query.Types[EltTypeIdx];
+ const LLT VecTy = Query.Types[VecTypeIdx];
+ const unsigned DstEltSize = EltTy.getSizeInBits();
+ const unsigned VecSize = VecTy.getSizeInBits();
+
+ const unsigned TargetEltSize =
+ DstEltSize % 64 == 0 ? 64 : 32;
+ return std::pair(VecTypeIdx,
+ LLT::fixed_vector(VecSize / TargetEltSize,
+ TargetEltSize));
+ })
+ .clampScalar(EltTypeIdx, S32, S64)
+ .clampScalar(VecTypeIdx, S32, S64)
+ .clampScalar(IdxTypeIdx, S32, S32)
+ .clampMaxNumElements(VecTypeIdx, S32, 32)
+ // TODO: Clamp elements for 64-bit vectors?
+ .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
+ moreElementsToNextExistingRegClass(VecTypeIdx))
+ // It should only be necessary with variable indexes.
+ // As a last resort, lower to the stack
+ .lower();
}
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -1879,15 +1883,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
- auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V32S32)
- .clampNumElements(0, V2S64, V16S64)
- .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
- .moreElementsIf(
- isIllegalRegisterType(ST, 0),
- moreElementsToNextExistingRegClass(0));
+ auto &BuildVector =
+ getActionDefinitionsBuilder(G_BUILD_VECTOR)
+ .legalForCartesianProduct(AllS32Vectors, {S32})
+ .legalForCartesianProduct(AllS64Vectors, {S64})
+ .clampNumElements(0, V16S32, V32S32)
+ .clampNumElements(0, V2S64, V16S64)
+ .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
+ .moreElementsIf(isIllegalRegisterType(ST, 0),
+ moreElementsToNextExistingRegClass(0));
if (ST.hasScalarPackInsts()) {
BuildVector
@@ -1911,10 +1915,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
- .clampMaxNumElements(0, S32, 32)
- .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
- .clampMaxNumElements(0, S16, 64);
+ .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
+ .clampMaxNumElements(0, S32, 32)
+ .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
+ .clampMaxNumElements(0, S16, 64);
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
@@ -1935,34 +1939,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return false;
};
- auto &Builder = getActionDefinitionsBuilder(Op)
- .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
- .lowerFor({{S16, V2S16}})
- .lowerIf([=](const LegalityQuery &Query) {
- const LLT BigTy = Query.Types[BigTyIdx];
- return BigTy.getSizeInBits() == 32;
- })
- // Try to widen to s16 first for small types.
- // TODO: Only do this on targets with legal s16 shifts
- .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
- .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
- .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
- .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
- elementTypeIs(1, S16)),
- changeTo(1, V2S16))
- // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
- // worth considering the multiples of 64 since 2*192 and 2*384 are not
- // valid.
- .clampScalar(LitTyIdx, S32, S512)
- .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
- // Break up vectors with weird elements into scalars
- .fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
- scalarize(0))
- .fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
- scalarize(1))
- .clampScalar(BigTyIdx, S32, MaxScalar);
+ auto &Builder =
+ getActionDefinitionsBuilder(Op)
+ .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
+ .lowerFor({{S16, V2S16}})
+ .lowerIf([=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ return BigTy.getSizeInBits() == 32;
+ })
+ // Try to widen to s16 first for small types.
+ // TODO: Only do this on targets with legal s16 shifts
+ .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
+ .moreElementsIf(isSmallOddVector(BigTyIdx),
+ oneMoreElement(BigTyIdx))
+ .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
+ elementTypeIs(1, S16)),
+ changeTo(1, V2S16))
+ // Clamp the little scalar to s8-s256 and make it a power of 2. It's
+ // not worth considering the multiples of 64 since 2*192 and 2*384
+ // are not valid.
+ .clampScalar(LitTyIdx, S32, S512)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
+ // Break up vectors with weird elements into scalars
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) {
+ return notValidElt(Query, LitTyIdx);
+ },
+ scalarize(0))
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) {
+ return notValidElt(Query, BigTyIdx);
+ },
+ scalarize(1))
+ .clampScalar(BigTyIdx, S32, MaxScalar);
if (Op == G_MERGE_VALUES) {
Builder.widenScalarIf(