[llvm] ac7abe0 - AMDGPU/GlobalISel: Manually select G_BUILD_VECTOR_TRUNC
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 08:08:16 PST 2020
Author: Matt Arsenault
Date: 2020-02-21T10:34:11-05:00
New Revision: ac7abe0ba9ae4c6a2248cc3ef4e4fe7e6d270105
URL: https://github.com/llvm/llvm-project/commit/ac7abe0ba9ae4c6a2248cc3ef4e4fe7e6d270105
DIFF: https://github.com/llvm/llvm-project/commit/ac7abe0ba9ae4c6a2248cc3ef4e4fe7e6d270105.diff
LOG: AMDGPU/GlobalISel: Manually select G_BUILD_VECTOR_TRUNC
We have patterns for s_pack* selection, but they assume the inputs are
a build_vector with 16-bit inputs, not a truncating build
vector. Since there's still outstanding work for how to handle
mismatched result and source element vector operations, and since I'm
trying a different packed vector strategy than SelectionDAG, just
manually select this for now.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 0dafa7606394..e504632f152e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -595,6 +595,90 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
return true;
}
+static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
+ int64_t Val;
+ return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
+}
+
+bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
+ MachineInstr &MI) const {
+ if (selectImpl(MI, *CoverageInfo))
+ return true;
+
+ const LLT S32 = LLT::scalar(32);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI->getType(Dst) != V2S16)
+ return false;
+
+ const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstBank->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ if (MRI->getType(Src0) != S32)
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+
+ // TODO: This should probably be a combine somewhere
+ // (build_vector_trunc $src0, undef) -> (copy $src0)
+ MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
+ if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+ MI.setDesc(TII.get(AMDGPU::COPY));
+ MI.RemoveOperand(2);
+ return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
+ RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
+ }
+
+ Register ShiftSrc0;
+ Register ShiftSrc1;
+ int64_t ShiftAmt;
+
+ // With multiple uses of the shift, this will duplicate the shift and
+ // increase register pressure.
+ //
+ // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
+ // => (S_PACK_HH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
+ // => (S_PACK_LH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, $src1)
+ // => (S_PACK_LL_B32_B16 $src0, $src1)
+
+ // FIXME: This is an inconvenient way to check a specific value
+ bool Shift0 = mi_match(
+ Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ bool Shift1 = mi_match(
+ Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
+ if (Shift0 && Shift1) {
+ Opc = AMDGPU::S_PACK_HH_B32_B16;
+ MI.getOperand(1).setReg(ShiftSrc0);
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift1) {
+ Opc = AMDGPU::S_PACK_LH_B32_B16;
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift0 && isZero(Src1, *MRI)) {
+ // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+ .addReg(ShiftSrc0)
+ .addImm(16);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+
+ MI.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
return selectG_ADD_SUB(I);
}
@@ -2062,6 +2146,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_MERGE_VALUES(I);
case TargetOpcode::G_UNMERGE_VALUES:
return selectG_UNMERGE_VALUES(I);
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC:
+ return selectG_BUILD_VECTOR_TRUNC(I);
case TargetOpcode::G_PTR_ADD:
return selectG_PTR_ADD(I);
case TargetOpcode::G_IMPLICIT_DEF:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 28ecb9b6286d..c0ba3a990ad1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -94,6 +94,7 @@ class AMDGPUInstructionSelector : public InstructionSelector {
bool selectG_EXTRACT(MachineInstr &I) const;
bool selectG_MERGE_VALUES(MachineInstr &I) const;
bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
+ bool selectG_BUILD_VECTOR_TRUNC(MachineInstr &I) const;
bool selectG_PTR_ADD(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
bool selectG_INSERT(MachineInstr &I) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
index 3abd70b80b7e..9ab7db5347ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir
@@ -1,5 +1,4 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# XFAIL: *
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
---
@@ -20,8 +19,8 @@ body: |
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
- %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
- S_ENDPGM 0, implicit %4
+ %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+ S_ENDPGM 0, implicit %2
...
---
@@ -142,8 +141,9 @@ body: |
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_0_s32_s_s32
; GFX9: liveins: $sgpr0
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX9: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY]], 16, implicit-def $scc
- ; GFX9: S_ENDPGM 0, implicit [[S_LSHL_B32_]]
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]]
+ ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = G_CONSTANT i32 0
@@ -153,29 +153,29 @@ body: |
...
---
-name: test_build_vector_v_v2s16_v_s32_s_undef_s32
+name: test_build_vector_trunc_s_v2s16_s_s32_s_undef_s32
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $sgpr0
- ; GFX9-LABEL: name: test_build_vector_v_v2s16_v_s32_s_undef_s32
- ; GFX9: liveins: $vgpr0
- ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_s32_s_undef_s32
+ ; GFX9: liveins: $sgpr0
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX9: S_ENDPGM 0, implicit [[COPY]]
- %0:vgpr(s32) = COPY $vgpr0
+ %0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = G_IMPLICIT_DEF
- %2:vgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+ %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
-name: test_build_vector_trunc_s_v2s16_s_s32_s_undef_s32
+name: test_build_vector_trunc_s_v2s16_s_undef_s32_s_s32
legalized: true
regBankSelected: true
tracksRegLiveness: true
@@ -184,20 +184,44 @@ body: |
bb.0:
liveins: $sgpr0
- ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_s32_s_undef_s32
+ ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_undef_s32_s_s32
; GFX9: liveins: $sgpr0
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX9: S_ENDPGM 0, implicit [[COPY]]
+ ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[COPY]]
+ ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = G_IMPLICIT_DEF
+ %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %1, %0
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_s_v2s16_s_undef_s_s32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr1
+
+ ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_undef_s_s32
+ ; GFX9: liveins: $sgpr1
+ ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[COPY]]
+ ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+ %0:sgpr(s32) = G_IMPLICIT_DEF
+ %1:sgpr(s32) = COPY $sgpr1
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
-name: test_build_vector_trunc_s_v2s16_s_undef_s32_s_s32
+name: test_build_vector_trunc_s_v2s16_s_s32_undef
legalized: true
regBankSelected: true
tracksRegLiveness: true
@@ -206,15 +230,81 @@ body: |
bb.0:
liveins: $sgpr0
- ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_undef_s32_s_s32
+ ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_s32_undef
; GFX9: liveins: $sgpr0
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX9: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY]], 16, implicit-def $scc
- ; GFX9: S_ENDPGM 0, implicit [[S_LSHL_B32_]]
+ ; GFX9: S_ENDPGM 0, implicit [[COPY]]
%0:sgpr(s32) = COPY $sgpr0
-
%1:sgpr(s32) = G_IMPLICIT_DEF
+ %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+ S_ENDPGM 0, implicit %2
+...
- %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %1, %0
+---
+name: test_build_vector_trunc_s_v2s16_s_zero_s_s32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr1
+
+ ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_zero_s_s32
+ ; GFX9: liveins: $sgpr1
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]]
+ ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+ %0:sgpr(s32) = G_CONSTANT i32 0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
+
+---
+name: test_build_vector_trunc_s_v2s16_s_s32_zero
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_s_s32_zero
+ ; GFX9: liveins: $sgpr0
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]]
+ ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_CONSTANT i32 0
+ %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: test_build_vector_trunc_lshr16_zero
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX9-LABEL: name: test_build_vector_trunc_lshr16_zero
+ ; GFX9: liveins: $sgpr0
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], 16, implicit-def $scc
+ ; GFX9: S_ENDPGM 0, implicit [[S_LSHR_B32_]]
+ %0:sgpr(s32) = G_CONSTANT i32 0
+ %1:sgpr(s32) = COPY $sgpr0
+
+ %2:sgpr(s32) = G_CONSTANT i32 16
+ %3:sgpr(s32) = G_LSHR %1, %2
+
+ %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %3, %0
+ S_ENDPGM 0, implicit %4
+...
More information about the llvm-commits
mailing list