[llvm] e580552 - AMDGPU/GlobalISel: Select v2s32->v2s16 G_TRUNC
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 17 06:20:24 PST 2020
Author: Matt Arsenault
Date: 2020-02-17T09:20:13-05:00
New Revision: e5805529bf037d692d6ae84f5b612b103f1d3cae
URL: https://github.com/llvm/llvm-project/commit/e5805529bf037d692d6ae84f5b612b103f1d3cae
DIFF: https://github.com/llvm/llvm-project/commit/e5805529bf037d692d6ae84f5b612b103f1d3cae.diff
LOG: AMDGPU/GlobalISel: Select v2s32->v2s16 G_TRUNC
It would be nice if there were a way to avoid the tied operand, but as
far as I can tell there isn't a way to use an or with op_sel to achieve
this.
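
For context, the two lowerings this selects look roughly as follows
(excerpted from the new trunc.ll checks below; the SDWA form is where
the tied operand comes in, since dst_unused:UNUSED_PRESERVE must keep
the low 16 bits of the destination):

  ; SDWA path (e.g. gfx8):
  v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
  ; non-SDWA fallback (e.g. gfx7):
  v_lshlrev_b32_e32 v1, 16, v1
  v_and_b32_e32 v0, 0xffff, v0
  v_or_b32_e32 v0, v1, v0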
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 92805d04adca..b47685990960 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1208,9 +1208,6 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- if (!DstTy.isScalar())
- return false;
-
const LLT S1 = LLT::scalar(1);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -1225,6 +1222,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
}
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
@@ -1233,6 +1232,71 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
const TargetRegisterClass *DstRC
= TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register LoReg = MRI->createVirtualRegister(DstRC);
+ Register HiReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(SrcReg, 0, AMDGPU::sub0);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(SrcReg, 0, AMDGPU::sub1);
+
+ if (IsVALU && STI.hasSDWA()) {
+ // Write the low 16 bits of the high element into the high 16 bits of the
+ // low element.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(HiReg) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(LoReg, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ Register TmpReg0 = MRI->createVirtualRegister(DstRC);
+ Register TmpReg1 = MRI->createVirtualRegister(DstRC);
+ Register ImmReg = MRI->createVirtualRegister(DstRC);
+ if (IsVALU) {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
+ .addImm(16)
+ .addReg(HiReg);
+ } else {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+ .addReg(HiReg)
+ .addImm(16);
+ }
+
+ unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
+ unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
+
+ BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
+ .addReg(LoReg)
+ .addReg(ImmReg);
+ BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
+ .addReg(TmpReg0)
+ .addReg(TmpReg1);
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ if (!DstTy.isScalar())
+ return false;
+
if (SrcSize > 32) {
int SubRegIdx = sizeToSubRegIndex(DstSize);
if (SubRegIdx == -1)
@@ -1240,17 +1304,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
// Deal with weird cases where the class only partially supports the subreg
// index.
- SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
- if (!SrcRC)
+ const TargetRegisterClass *SrcWithSubRC
+ = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
+ if (!SrcWithSubRC)
return false;
- I.getOperand(1).setSubReg(SubRegIdx);
- }
+ if (SrcWithSubRC != SrcRC) {
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
+ return false;
+ }
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
- return false;
+ I.getOperand(1).setSubReg(SubRegIdx);
}
I.setDesc(TII.get(TargetOpcode::COPY));
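
For anyone reading the non-SDWA expansion above, it is just the usual
16-bit pack; a minimal standalone sketch of the same bit math
(hypothetical helper, not part of the patch):

  #include <cstdint>

  // Scalar equivalent of the S_LSHL_B32/S_AND_B32/S_OR_B32 (or matching
  // VALU) sequence: place the low 16 bits of Hi above the low 16 bits
  // of Lo in a single 32-bit result.
  static uint32_t packV2I16(uint32_t Lo, uint32_t Hi) {
    return (Hi << 16) | (Lo & 0xffffu);
  }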
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir
new file mode 100644
index 000000000000..fba87c730465
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
+
+---
+
+name: trunc_sgpr_v2s32_to_v2s16
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX6-LABEL: name: trunc_sgpr_v2s32_to_v2s16
+ ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX6: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc
+ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; GFX6: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+ ; GFX6: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc
+ ; GFX6: S_ENDPGM 0, implicit [[S_OR_B32_]]
+ ; GFX8-LABEL: name: trunc_sgpr_v2s32_to_v2s16
+ ; GFX8: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+ ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; GFX8: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc
+ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+ ; GFX8: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc
+ ; GFX8: S_ENDPGM 0, implicit [[S_OR_B32_]]
+ %0:sgpr(<2 x s32>) = COPY $sgpr0_sgpr1
+ %1:sgpr(<2 x s16>) = G_TRUNC %0
+ S_ENDPGM 0, implicit %1
+...
+
+---
+
+name: trunc_vgpr_v2s32_to_v2s16
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX6-LABEL: name: trunc_vgpr_v2s32_to_v2s16
+ ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+ ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+ ; GFX6: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec
+ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+ ; GFX6: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_LSHLREV_B32_e64_]], [[V_AND_B32_e64_]], implicit $exec
+ ; GFX6: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+ ; GFX8-LABEL: name: trunc_vgpr_v2s32_to_v2s16
+ ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+ ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+ ; GFX8: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[COPY2]], 0, 5, 2, 4, implicit $exec, implicit [[COPY1]](tied-def 0)
+ ; GFX8: S_ENDPGM 0, implicit [[V_MOV_B32_sdwa]]
+ %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
+ %1:vgpr(<2 x s16>) = G_TRUNC %0
+ S_ENDPGM 0, implicit %1
+...
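
(If the expected instructions change, these checks can be regenerated
rather than hand-edited. A typical invocation, assuming a built llc and
the script's usual --llc-binary option:

  llvm/utils/update_mir_test_checks.py \
    --llc-binary <build>/bin/llc \
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir)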
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
new file mode 100644
index 000000000000..4809e530f92c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+
+define i16 @v_trunc_i32_to_i16(i32 %src) {
+; GFX7-LABEL: v_trunc_i32_to_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_i32_to_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %trunc = trunc i32 %src to i16
+ ret i16 %trunc
+}
+
+define amdgpu_ps i16 @s_trunc_i32_to_i16(i32 inreg %src) {
+; GFX7-LABEL: s_trunc_i32_to_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_i32_to_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: ; return to shader part epilog
+ %trunc = trunc i32 %src to i16
+ ret i16 %trunc
+}
+
+define i16 @v_trunc_i64_to_i16(i64 %src) {
+; GFX7-LABEL: v_trunc_i64_to_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_i64_to_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %trunc = trunc i64 %src to i16
+ ret i16 %trunc
+}
+
+define amdgpu_ps i16 @s_trunc_i64_to_i16(i64 inreg %src) {
+; GFX7-LABEL: s_trunc_i64_to_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_i64_to_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: ; return to shader part epilog
+ %trunc = trunc i64 %src to i16
+ ret i16 %trunc
+}
+
+define amdgpu_ps i16 @s_trunc_i128_to_i16(i128 inreg %src) {
+; GFX7-LABEL: s_trunc_i128_to_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_i128_to_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: ; return to shader part epilog
+ %trunc = trunc i128 %src to i16
+ ret i16 %trunc
+}
+
+define i16 @v_trunc_i128_to_i16(i128 %src) {
+; GFX7-LABEL: v_trunc_i128_to_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_i128_to_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %trunc = trunc i128 %src to i16
+ ret i16 %trunc
+}
+
+define i32 @v_trunc_v2i32_to_v2i16(<2 x i32> %src) {
+; GFX7-LABEL: v_trunc_v2i32_to_v2i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_v2i32_to_v2i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %trunc = trunc <2 x i32> %src to <2 x i16>
+ %cast = bitcast <2 x i16> %trunc to i32
+ ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_trunc_v2i32_to_v2i16(<2 x i32> inreg %src) {
+; GFX7-LABEL: s_trunc_v2i32_to_v2i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_v2i32_to_v2i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: ; return to shader part epilog
+ %trunc = trunc <2 x i32> %src to <2 x i16>
+ %cast = bitcast <2 x i16> %trunc to i32
+ ret i32 %cast
+}