[llvm] 758df22 - [AMDGPU][True16] Support emitting copies between different register sizes.
Ivan Kosarev via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 26 04:15:41 PDT 2023
Author: Ivan Kosarev
Date: 2023-09-26T12:15:34+01:00
New Revision: 758df22bcf21d20865806a8980bcd5f5e5ddf812
URL: https://github.com/llvm/llvm-project/commit/758df22bcf21d20865806a8980bcd5f5e5ddf812
DIFF: https://github.com/llvm/llvm-project/commit/758df22bcf21d20865806a8980bcd5f5e5ddf812.diff
LOG: [AMDGPU][True16] Support emitting copies between different register sizes.
Differential Revision: https://reviews.llvm.org/D156105
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/VOP1Instructions.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cf391856bf733fb..30e3179f8eb7d83 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -724,24 +724,39 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
+ unsigned Size = RI.getRegSizeInBits(*RC);
+ const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
+ unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
+
+ // The rest of copyPhysReg assumes Src and Dst size are the same size.
+ // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
+ // we remove Fix16BitCopies and this code block?
+ if (Fix16BitCopies) {
+ if (((Size == 16) != (SrcSize == 16))) {
+ if (ST.hasTrue16BitInsts()) {
+ // Non-VGPR Src and Dst will later be expanded back to 32 bits.
+ MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
+ MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
+ RegToFix = SubReg;
+ } else {
+ MCRegister &RegToFix = (Size == 16) ? DestReg : SrcReg;
+ MCRegister Super = RI.get32BitRegister(RegToFix);
+ assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix ||
+ RI.getSubReg(Super, AMDGPU::hi16) == RegToFix);
+ RegToFix = Super;
+ }
- // FIXME: This is hack to resolve copies between 16 bit and 32 bit
- // registers until all patterns are fixed.
- if (Fix16BitCopies &&
- ((RI.getRegSizeInBits(*RC) == 16) ^
- (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) {
- MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
- MCRegister Super = RI.get32BitRegister(RegToFix);
- assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
- RegToFix = Super;
-
- if (DestReg == SrcReg) {
- // Insert empty bundle since ExpandPostRA expects an instruction here.
- BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
- return;
+ if (DestReg == SrcReg) {
+ // Identity copy. Insert empty bundle since ExpandPostRA expects an
+ // instruction here.
+ BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
+ return;
+ }
+ RC = RI.getPhysRegBaseClass(DestReg);
+ Size = RI.getRegSizeInBits(*RC);
+ SrcRC = RI.getPhysRegBaseClass(SrcReg);
+ SrcSize = RI.getRegSizeInBits(*SrcRC);
}
-
- RC = RI.getPhysRegBaseClass(DestReg);
}
if (RC == &AMDGPU::VGPR_32RegClass) {
@@ -865,10 +880,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- const unsigned Size = RI.getRegSizeInBits(*RC);
if (Size == 16) {
- assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
- AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
+ assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
@@ -906,6 +919,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (ST.hasTrue16BitInsts()) {
+ if (IsSGPRSrc) {
+ assert(SrcLow);
+ SrcReg = NewSrcReg;
+ }
+ // Use the smaller instruction encoding if possible.
+ if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
+ (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
+ .addReg(SrcReg);
+ } else {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
+ .addImm(0) // src0_modifiers
+ .addReg(SrcReg)
+ .addImm(0); // op_sel
+ }
+ return;
+ }
+
if (IsSGPRSrc && !ST.hasSDWAScalar()) {
if (!DstLow || !SrcLow) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
@@ -932,7 +964,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
if (ST.hasMovB64()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
@@ -1288,7 +1319,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.isAGPRClass(DstRC))
return AMDGPU::COPY;
- if (RI.getRegSizeInBits(*DstRC) == 32) {
+ if (RI.getRegSizeInBits(*DstRC) == 16) {
+ // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
+ // before RA.
+ return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
+ } else if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
} else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
return AMDGPU::S_MOV_B64;
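As a reading aid for the copyPhysReg hunks above, here is a minimal C++ sketch of the new Fix16BitCopies size-mismatch handling. It is not the committed code: the Reg struct and the getLo16SubReg/get32BitSuperReg stand-ins are invented for illustration, while the real function works on MCRegister through SIRegisterInfo (getSubReg with AMDGPU::lo16, get32BitRegister).

#include <cassert>

// Simplified stand-in for a physical register: an id plus a width in bits.
struct Reg {
  unsigned Id;
  unsigned SizeInBits;
};

// Placeholder helpers standing in for RI.getSubReg(Reg, AMDGPU::lo16) and
// RI.get32BitRegister(Reg); the bodies only adjust the width so the sketch
// compiles.
static Reg getLo16SubReg(Reg R) { return {R.Id, 16}; }
static Reg get32BitSuperReg(Reg R) { return {R.Id, 32}; }

// Mirrors the new Fix16BitCopies block: when exactly one side of the copy is
// 16 bits wide, resolve the mismatch before the rest of copyPhysReg runs.
// With true16 instructions the 32-bit side is narrowed to its lo16 half;
// without them the 16-bit side is widened to its 32-bit super-register.
static void normalizeCopySizes(Reg &Dst, Reg &Src, bool HasTrue16BitInsts) {
  if ((Dst.SizeInBits == 16) == (Src.SizeInBits == 16))
    return; // Both or neither side is 16 bits; nothing to fix up.

  // The sketch only models the 16-bit vs 32-bit case the hack targets.
  assert((Dst.SizeInBits == 32 || Src.SizeInBits == 32) &&
         "only the 16-vs-32-bit mismatch is modelled here");

  if (HasTrue16BitInsts) {
    Reg &RegToFix = (Dst.SizeInBits == 32) ? Dst : Src;
    RegToFix = getLo16SubReg(RegToFix);
  } else {
    Reg &RegToFix = (Dst.SizeInBits == 16) ? Dst : Src;
    RegToFix = get32BitSuperReg(RegToFix);
  }
}

The committed code additionally handles identity copies at this point by emitting an empty BUNDLE for ExpandPostRA, and then re-derives the register classes and sizes for the fixed registers before continuing.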
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 00e4701e33bf521..734db326fb77ddd 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -656,6 +656,7 @@ let SubtargetPredicate = isGFX11Plus in {
getVOP1Pat64<int_amdgcn_permlane64,
VOP_MOVRELS>.ret,
/*VOP1Only=*/ 1>;
+ defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
@@ -804,6 +805,7 @@ defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a,
defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b,
"V_FFBH_I32", "v_cls_i32">;
defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>;
+defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x01c, "v_mov_b16">;
defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">;
defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">;
defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">;
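Similarly, a hedged sketch of the two opcode decisions the rest of the patch makes for 16-bit moves. The enum and the boolean parameters below are invented for illustration; the committed code instead tests membership in VGPR_16_Lo128RegClass and the SGPR classes and builds the MachineInstrs with BuildMI.

// The real instructions a 16-bit copy or move can lower to after this patch.
enum class Mov16Opc { V_MOV_B16_t16_e32, V_MOV_B16_t16_e64, COPY };

// Post-RA copy lowering in copyPhysReg: prefer the shorter VOP1 (_e32)
// encoding when the destination half lives in the low 128 VGPRs and the
// source is either an SGPR or also a low-128 VGPR half; otherwise fall back
// to the VOP3 (_e64) form, which carries the extra src0_modifiers and op_sel
// operands.
Mov16Opc select16BitCopyOpc(bool DstIsLo128Vgpr16, bool SrcIsLo128Vgpr16,
                            bool SrcIsSgpr) {
  if (DstIsLo128Vgpr16 && (SrcIsSgpr || SrcIsLo128Vgpr16))
    return Mov16Opc::V_MOV_B16_t16_e32;
  return Mov16Opc::V_MOV_B16_t16_e64;
}

// Pre-RA move selection in getMovOpcode: SGPR destinations keep a generic
// COPY, and VGPR destinations use the _e64 form, since only _e64 true16
// instructions are legal before register allocation.
Mov16Opc selectPreRAMov16(bool DstIsSgprClass) {
  return DstIsSgprClass ? Mov16Opc::COPY : Mov16Opc::V_MOV_B16_t16_e64;
}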