[llvm] 96e51ed - [AMDGPU] Implement copyPhysReg for 16 bit subregs
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 7 14:22:57 PDT 2020
Author: Stanislav Mekhanoshin
Date: 2020-04-07T14:22:46-07:00
New Revision: 96e51ed005a960d1c746a0a7774ce255bb497ed5
URL: https://github.com/llvm/llvm-project/commit/96e51ed005a960d1c746a0a7774ce255bb497ed5
DIFF: https://github.com/llvm/llvm-project/commit/96e51ed005a960d1c746a0a7774ce255bb497ed5.diff
LOG: [AMDGPU] Implement copyPhysReg for 16 bit subregs
Differential Revision: https://reviews.llvm.org/D74937
Added:
llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index bd3741645c1d..fb1b4b9f5f03 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -679,6 +679,74 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (RC == &AMDGPU::VGPR_LO16RegClass || RC == &AMDGPU::VGPR_HI16RegClass) {
+ assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(SrcReg));
+
+ //          d          s
+ // l -> l : hhhhxxxx : xxxxllll -> v_alignbyte_b32 d, s, d, 2
+ // llllhhhh : xxxxllll -> v_alignbyte_b32 d, d, d, 2
+ // l -> h : xxxxllll : xxxxhhhh -> v_lshlrev_b32 d, 16, d
+ // llll0000 : xxxxhhhh -> v_alignbyte_b32 d, s, d, 2
+ // h -> l : hhhhxxxx : llllxxxx -> v_lshrrev_b32 d, 16, d
+ // 0000hhhh : llllxxxx -> v_alignbyte_b32 d, d, s, 2
+ // h -> h : xxxxllll : hhhhxxxx -> v_alignbyte_b32 d, d, s, 2
+ // llllhhhh : hhhhxxxx -> v_alignbyte_b32 d, d, d, 2
+
+ bool DstLow = RC == &AMDGPU::VGPR_LO16RegClass;
+ bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg);
+ DestReg = RI.getMatchingSuperReg(DestReg,
+ DstLow ? AMDGPU::lo16 : AMDGPU::hi16,
+ &AMDGPU::VGPR_32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg,
+ SrcLow ? AMDGPU::lo16 : AMDGPU::hi16,
+ &AMDGPU::VGPR_32RegClass);
+
+ if (DestReg == SrcReg) {
+ // l -> h : v_pk_add_u16 v1, v1, 0 op_sel_hi:[0,0]
+ // h -> l : v_pk_add_u16 v1, v1, 0 op_sel:[1,0] op_sel_hi:[1,0]
+ if (DstLow == SrcLow)
+ return;
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_ADD_U16), DestReg)
+ .addImm(DstLow ? SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1 : 0)
+ .addReg(DestReg, RegState::Undef)
+ .addImm(0) // src1_mod
+ .addImm(0) // src1
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0);
+
+ return;
+ }
+
+ // Last instruction first:
+ auto Last = BuildMI(MBB, MI, DL, get(AMDGPU::V_ALIGNBYTE_B32), DestReg)
+ .addReg((SrcLow && !DstLow) ? SrcReg : DestReg,
+ (SrcLow && !DstLow) ? getKillRegState(KillSrc) : 0)
+ .addReg((!SrcLow && DstLow) ? SrcReg : DestReg,
+ (!SrcLow && DstLow) ? getKillRegState(KillSrc) : 0)
+ .addImm(2);
+
+ unsigned OpcFirst = (DstLow == SrcLow) ? AMDGPU::V_ALIGNBYTE_B32
+ : SrcLow ? AMDGPU::V_LSHLREV_B32_e32
+ : AMDGPU::V_LSHRREV_B32_e32;
+ auto First = BuildMI(MBB, &*Last, DL, get(OpcFirst), DestReg);
+ if (DstLow == SrcLow) { // alignbyte
+ First.addReg(SrcLow ? SrcReg : DestReg,
+ SrcLow ? getKillRegState(KillSrc) : RegState::Undef)
+ .addReg(SrcLow ? DestReg : SrcReg,
+ SrcLow ? RegState::Undef : getKillRegState(KillSrc))
+ .addImm(2);
+ } else {
+ First.addImm(16)
+ .addReg(DestReg, RegState::Undef);
+ }
+
+ return;
+ }
+
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
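For reference, here is a small standalone C++ sketch (not part of the patch; the helper names and test values are made up for illustration) that models the bit-level effect of the sequences described in the comment block above, assuming v_alignbyte_b32 d, s0, s1, 2 computes d = ({s0, s1} >> 16) & 0xffffffff, i.e. d takes the low half of s0 and the high half of s1:

// Sketch only: models the 16-bit subreg copy expansion on plain integers.
#include <cassert>
#include <cstdint>

// Assumed semantics of v_alignbyte_b32 d, s0, s1, 2.
static uint32_t alignbyte2(uint32_t S0, uint32_t S1) {
  return uint32_t(((uint64_t(S0) << 32) | S1) >> 16);
}

static uint16_t lo16(uint32_t V) { return V & 0xffff; }
static uint16_t hi16(uint32_t V) { return V >> 16; }

int main() {
  uint32_t Src = 0x1234abcd, Dst = 0x5678ef01; // arbitrary illustrative values

  // lo -> lo: two v_alignbyte_b32; Dst.hi16 preserved, Dst.lo16 = Src.lo16.
  uint32_t D = alignbyte2(Src, Dst); // llllhhhh
  D = alignbyte2(D, D);              // hhhhllll
  assert(lo16(D) == lo16(Src) && hi16(D) == hi16(Dst));

  // lo -> hi: v_lshlrev_b32 then v_alignbyte_b32; Dst.lo16 preserved.
  D = Dst << 16;                     // llll0000
  D = alignbyte2(Src, D);            // hi16 = Src.lo16, lo16 = old Dst.lo16
  assert(hi16(D) == lo16(Src) && lo16(D) == lo16(Dst));

  // hi -> lo: v_lshrrev_b32 then v_alignbyte_b32; Dst.hi16 preserved.
  D = Dst >> 16;                     // 0000hhhh
  D = alignbyte2(D, Src);            // lo16 = Src.hi16, hi16 = old Dst.hi16
  assert(lo16(D) == hi16(Src) && hi16(D) == hi16(Dst));

  // hi -> hi: two v_alignbyte_b32; Dst.lo16 preserved, Dst.hi16 = Src.hi16.
  D = alignbyte2(Dst, Src);          // llllhhhh
  D = alignbyte2(D, D);              // hhhhllll
  assert(hi16(D) == hi16(Src) && lo16(D) == lo16(Dst));

  // Same-register lo -> hi (v_pk_add_u16 ... op_sel_hi:[0,0]): both result
  // halves take the low half of the source register.
  D = (uint32_t(lo16(Src)) << 16) | lo16(Src);
  assert(hi16(D) == lo16(Src) && lo16(D) == lo16(Src));

  // Same-register hi -> lo (v_pk_add_u16 ... op_sel:[1,0] op_sel_hi:[1,0]):
  // both result halves take the high half of the source register.
  D = (uint32_t(hi16(Src)) << 16) | hi16(Src);
  assert(lo16(D) == hi16(Src) && hi16(D) == hi16(Src));

  return 0;
}

Each assert checks that the copied 16-bit lane receives the source value while the sibling lane of the destination register keeps its previous contents.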
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 06d05d349f36..deb127022ffa 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1279,6 +1279,8 @@ StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
static const TargetRegisterClass *const BaseClasses[] = {
+ &AMDGPU::VGPR_LO16RegClass,
+ &AMDGPU::VGPR_HI16RegClass,
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::AGPR_32RegClass,
@@ -1318,6 +1320,9 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
switch (Size) {
+ case 16:
+ return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
+ getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
case 32:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
case 64:
diff --git a/llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir b/llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
new file mode 100644
index 000000000000..79d2f48421fd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
@@ -0,0 +1,202 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: {{^}}lo_to_lo:
+# GCN: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: lo_to_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1_lo16 = COPY $vgpr0_lo16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi:
+# GCN: v_lshlrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+name: lo_to_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1_hi16 = COPY killed $vgpr0_lo16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_lo:
+# GCN: v_lshrrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+name: hi_to_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1_lo16 = COPY $vgpr0_hi16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_hi:
+# GCN: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: hi_to_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1_hi16 = COPY $vgpr0_hi16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_lo_samereg:
+# GCN: s_waitcnt
+# GCN-NEXT: s_endpgm
+name: lo_to_lo_samereg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr0_lo16 = COPY $vgpr0_lo16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi_samereg:
+# GCN: v_pk_add_u16 v0, v0, 0 op_sel_hi:[0,0]
+name: lo_to_hi_samereg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr0_hi16 = COPY $vgpr0_lo16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_lo_samereg:
+# GCN: v_pk_add_u16 v0, v0, 0 op_sel:[1,0] op_sel_hi:[1,0]
+name: hi_to_lo_samereg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr0_lo16 = COPY killed $vgpr0_hi16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_hi_samereg:
+# GCN: s_waitcnt
+# GCN-NEXT: s_endpgm
+name: hi_to_hi_samereg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr0_hi16 = COPY killed $vgpr0_hi16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_lo_def_livein:
+# GCN: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: lo_to_lo_def_livein
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr1_lo16 = COPY $vgpr0_lo16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi_def_livein:
+# GCN: v_lshlrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+name: lo_to_hi_def_livein
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr1_hi16 = COPY $vgpr0_lo16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_lo_def_livein:
+# GCN: v_lshrrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+name: hi_to_lo_def_livein
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr1_lo16 = COPY killed $vgpr0_hi16
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_hi_def_livein:
+# GCN: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: hi_to_hi_def_livein
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr1_hi16 = COPY $vgpr0_hi16
+ S_ENDPGM 0
+...
+
+# TODO: This can be coalesced into a VGPR_32 copy
+# GCN-LABEL: {{^}}lo_to_lo_hi_to_hi:
+# GCN: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN-NEXT: v_mov_b32_e32 v2, v1
+# GCN-NEXT: s_endpgm
+name: lo_to_lo_hi_to_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1_lo16 = COPY $vgpr0_lo16
+ $vgpr1_hi16 = COPY $vgpr0_hi16
+ $vgpr2 = COPY killed $vgpr1
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi_hi_to_lo:
+# GCN: v_lshrrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_mov_b32_e32 v2, v1
+# GCN-NEXT: s_endpgm
+name: lo_to_hi_hi_to_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1_lo16 = COPY $vgpr0_hi16
+ $vgpr1_hi16 = COPY $vgpr0_lo16
+ $vgpr2 = COPY killed $vgpr1
+ S_ENDPGM 0
+...
+
+# NB: a copy of an undef source is just killed instead of being expanded
+# GCN-LABEL: {{^}}lo_to_lo_undef:
+# GCN: s_waitcnt
+# GCN-NEXT: v_mov_b32_e32 v2, v1
+# GCN-NEXT: s_endpgm
+name: lo_to_lo_undef
+tracksRegLiveness: true
+body: |
+ bb.0:
+ $vgpr1_lo16 = COPY undef $vgpr0_lo16
+ $vgpr2 = COPY killed $vgpr1
+ S_ENDPGM 0
+...