[llvm] [AMDGPU][True16][CodeGen] add a d16 predicate for true16 mode (PR #156574)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 3 09:04:09 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
Changes:
There are some issues with D16 instructions in true16 mode, and they are under investigation. Add a D16 predicate and disable the D16 global/flat/scratch instructions for now.
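As a condensed sketch of the mechanism (lifted from the diff below, with the surrounding `OtherPredicates`/`True16Predicate` context elided): the new `TrueD16Predicate` field on `PredicateControl` splits pattern selection between the D16 forms and plain 32-bit fallbacks.

```tablegen
// With the new feature on, the D16 form loads directly into a 16-bit
// register half; with it off, a full 32-bit load is selected and the
// lo16 subregister is extracted instead (see FlatLoadPat_t16 below).
let TrueD16Predicate = UseRealTrueD16Insts in {
  defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
}
let TrueD16Predicate = NotUseRealTrueD16Insts in {
  defm : FlatLoadPats_t16<FLAT_LOAD_USHORT, load_flat, i16>;
}
```

Since `EnableRealTrueD16Insts` defaults to false and, as far as the truncated diff shows, no subtarget enables it, the D16 forms are off by default; presumably they can be re-enabled for experimentation via `-mattr=+real-true-d16`.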
---
Patch is 3.10 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156574.diff
70 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+13)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td (+6-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+124-27)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+10878-16664)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+286-300)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+4094-6684)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+92-125)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+69-124)
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+303-6)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+41-41)
- (modified) llvm/test/CodeGen/AMDGPU/flat-address-space.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll (+294-294)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.bf16.ll (+23-46)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+37-109)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+110-114)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+242-413)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+213-431)
- (modified) llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll (+46-94)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+50-117)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+90-8)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/icmp.i16.ll (+150-150)
- (modified) llvm/test/CodeGen/AMDGPU/idot4s.ll (+15-14)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+88-85)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+80-164)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+114-230)
- (modified) llvm/test/CodeGen/AMDGPU/mad.u16.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+26-47)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll (+20-30)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+1167-2907)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-global.ll (+1123-2909)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/smed3.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/spillv16.ll (+53-97)
- (modified) llvm/test/CodeGen/AMDGPU/strict_fpext.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/uaddo.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/umed3.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/usubo.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/v_pack.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll (+8-17)
- (modified) llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0e0b84f7e3374..44c3879d1f176 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -583,6 +583,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
+def FeatureRealTrueD16Insts : SubtargetFeature<"real-true-d16",
+ "EnableRealTrueD16Insts",
+ "true",
+ "Use D16 instructions with true 16-bit registere"
+>;
+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
@@ -2564,6 +2570,13 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+// Use D16 Insts in true16 mode
+def UseRealTrueD16Insts : TrueD16PredicateClass<"Subtarget->useRealTrueD16Insts()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureRealTrueD16Insts)>;
+def NotUseRealTrueD16Insts : TrueD16PredicateClass<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->useRealTrueD16Insts()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts)>;
+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td b/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td
index 7c990aa6b2eb6..43479afeb4c3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td
@@ -16,15 +16,19 @@ def FalsePredicate : Predicate<"false">;
class True16PredicateClass<string cond> : Predicate<cond>;
def NoTrue16Predicate : True16PredicateClass<"">;
+class TrueD16PredicateClass<string cond> : Predicate<cond>;
+def NoTrueD16Predicate : TrueD16PredicateClass<"">;
+
class PredicateControl {
Predicate SubtargetPredicate = TruePredicate;
Predicate AssemblerPredicate = TruePredicate;
Predicate WaveSizePredicate = TruePredicate;
True16PredicateClass True16Predicate = NoTrue16Predicate;
+ TrueD16PredicateClass TrueD16Predicate = NoTrueD16Predicate;
list<Predicate> OtherPredicates = [];
list<Predicate> Predicates =
!foldl(OtherPredicates, [SubtargetPredicate, AssemblerPredicate,
- WaveSizePredicate, True16Predicate],
+ WaveSizePredicate, True16Predicate, TrueD16Predicate],
preds, p,
- preds # !listremove([p], [TruePredicate, NoTrue16Predicate] # preds));
+ preds # !listremove([p], [TruePredicate, NoTrue16Predicate, NoTrueD16Predicate] # preds));
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 73acb1ddbd2a7..0e3524d7856b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
return hasTrue16BitInsts() && EnableRealTrue16Insts;
}
+bool AMDGPUSubtarget::useRealTrueD16Insts() const {
+ return hasTrue16BitInsts() && useRealTrue16Insts() && EnableRealTrueD16Insts;
+}
+
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 57b757c990e1a..1f5e4cbc9142e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
+ bool EnableRealTrueD16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
+ bool useRealTrueD16Insts() const;
+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
bool hasBF16ConversionInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 19f95c5ac4c37..c56ba3c58ea74 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1328,6 +1328,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset)
>;
+class FlatLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))),
(inst $vaddr, $offset, $cpol)
@@ -1398,11 +1403,21 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $vaddr, $offset)
>;
+class FlatLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)),
(inst $vaddr, $offset, $cpol)
@@ -1551,6 +1566,11 @@ class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType
(inst $vaddr, $offset)
>;
+class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1571,6 +1591,11 @@ class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
(inst $saddr, $offset)
>;
+class ScratchLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $saddr, $offset), lo16)
+>;
+
class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset), vt:$in)),
(inst $saddr, $offset, 0, $in)
@@ -1592,6 +1617,11 @@ class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType
(inst $vaddr, $saddr, $offset, $cpol)
>;
+class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16)
+>;
+
class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
(node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)),
@@ -1638,6 +1668,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}
+multiclass GlobalFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat_t16 <inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_M0 <inst, node, vt> {
let AddedComplexity = 10;
@@ -1766,6 +1806,21 @@ multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTy
}
}
+multiclass ScratchFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : ScratchLoadSignedPat_t16 <inst, node, vt> {
+ let AddedComplexity = 25;
+ }
+
+ def : ScratchLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 26;
+ }
+
+ def : ScratchLoadSVaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+ let SubtargetPredicate = HasFlatScratchSVSMode;
+ let AddedComplexity = 27;
+ }
+}
+
multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : ScratchStoreSignedPat <inst, node, vt> {
@@ -1837,6 +1892,15 @@ multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
}
}
+multiclass FlatLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_t16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadPat_D16 <inst, node, vt>;
@@ -1907,14 +1971,26 @@ let True16Predicate = p in {
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+ let TrueD16Predicate = UseRealTrueD16Insts in {
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+ }
+ let TrueD16Predicate = NotUseRealTrueD16Insts in {
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, load_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+ }
defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
@@ -2056,19 +2132,32 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
}
let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
+ let TrueD16Predicate = UseRealTrueD16Insts in {
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
+ }
+ let TrueD16Predicate = NotUseRealTrueD16Insts in {
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, load_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
+ }
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
} // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
foreach vt = Reg32Types.types in {
@@ -2297,12 +2386,20 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
}
let True16Predicate = UseRealTrue16Insts in {
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
+ let TrueD16Predicate = UseRealTrueD16Insts in {
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
+ }
+ let TrueD16Predicate = NotUseRealTrueD16Insts in {
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_USHORT, load_private, i16>;
+ }
+ defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
+ defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
} // End True16Predicate = UseRealTrue16Insts
foreach vt = Reg32Types.types in {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index d03d6a8940b2f..1dc53cec8df85 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -15369,876 +15369,913 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/156574