[llvm-branch-commits] [llvm] edaf6a0 - [AMDGPU][GISel] Combine G_INSERT_VECTOR_ELT to G_SHUFFLE_VECTOR
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Oct 19 04:03:00 PDT 2022
Author: Pierre van Houtryve
Date: 2022-10-19T10:16:08Z
New Revision: edaf6a07a4aafd963ea958703890d03ab58ff2dd
URL: https://github.com/llvm/llvm-project/commit/edaf6a07a4aafd963ea958703890d03ab58ff2dd
DIFF: https://github.com/llvm/llvm-project/commit/edaf6a07a4aafd963ea958703890d03ab58ff2dd.diff
LOG: [AMDGPU][GISel] Combine G_INSERT_VECTOR_ELT to G_SHUFFLE_VECTOR
Depends on D134967
Differential Revision: https://reviews.llvm.org/D135145
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 2415fdfecaae2..8b2ff164d3365 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -45,6 +45,12 @@ def cvt_f32_ubyteN : GICombineRule<
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+def insert_vec_elt_to_shuffle : GICombineRule<
+ (defs root:$insertelt, unsigned_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_INSERT_VECTOR_ELT):$insertelt,
+ [{ return PreLegalizerHelper.matchInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }]),
+ (apply [{ PreLegalizerHelper.applyInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }])>;
+
def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
def clamp_i64_to_i16 : GICombineRule<
@@ -109,7 +115,7 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPreLegalizerCombinerHelper",
- [all_combines, clamp_i64_to_i16, foldable_fneg]> {
+ [all_combines, clamp_i64_to_i16, foldable_fneg, insert_vec_elt_to_shuffle]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 6d6c69adaa658..08eefc6da4d31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -55,6 +55,9 @@ class AMDGPUPreLegalizerCombinerHelper {
void applyClampI64ToI16(MachineInstr &MI,
const ClampI64ToI16MatchInfo &MatchInfo);
+
+ bool matchInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx);
+ void applyInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx);
};
bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
@@ -154,6 +157,73 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
MI.eraseFromParent();
}
+bool AMDGPUPreLegalizerCombinerHelper::matchInsertVectorEltToShuffle(
+ MachineInstr &MI, unsigned &Idx) {
+ // Transforms a G_INSERT_VECTOR_ELT into an equivalent G_SHUFFLE_VECTOR if:
+ // - Scalar Pack insts are present (for <32 bits element types)
+ // - The vector has <= 4 elements.
+ // as this is a preferred canonical form of the operation.
+ //
+ // Note that both restrictions are arbitrary. Currently, it's mostly targeted
+ // towards 2x16 vectors. Restrictions could be relaxed or entirely removed in
+ // the future if codegen can handle it without causing regressions.
+
+ LLT VecTy = MRI.getType(MI.getOperand(0).getReg());
+ const unsigned EltSize = VecTy.getElementType().getSizeInBits();
+ if (EltSize < 32 &&
+ !MI.getMF()->getSubtarget<GCNSubtarget>().hasScalarPackInsts())
+ return false;
+
+ if (VecTy.isScalable() || VecTy.getNumElements() > 4)
+ return false;
+
+ Optional<ValueAndVReg> MaybeIdxVal =
+ getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+ if (!MaybeIdxVal)
+ return false;
+
+ Idx = MaybeIdxVal->Value.getZExtValue();
+ return true;
+}
+
+void AMDGPUPreLegalizerCombinerHelper::applyInsertVectorEltToShuffle(
+ MachineInstr &MI, unsigned &Idx) {
+ B.setInstrAndDebugLoc(MI);
+
+ Register Ins = MI.getOperand(2).getReg();
+ Register Vec = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+
+ LLT VecTy = MRI.getType(Dst);
+ LLT EltTy = VecTy.getElementType();
+ const unsigned NumElts = VecTy.getNumElements();
+
+ const auto Undef = MRI.createGenericVirtualRegister(EltTy);
+ B.buildUndef(Undef);
+
+ const auto OtherVec = MRI.createGenericVirtualRegister(VecTy);
+
+ SmallVector<Register, 4> Srcs;
+ Srcs.push_back(Ins);
+ for (unsigned K = 1; K < NumElts; ++K)
+ Srcs.push_back(Undef);
+
+ B.buildBuildVector(OtherVec, Srcs);
+
+ // Shuffle mask: index NumElts selects Ins (element 0 of OtherVec);
+ // indices 0...(NumElts-1) select the original elements of Vec.
+ SmallVector<int, 4> ShuffleMask;
+ for (unsigned CurIdx = 0; CurIdx < NumElts; ++CurIdx) {
+ if (CurIdx == Idx)
+ ShuffleMask.push_back(NumElts);
+ else
+ ShuffleMask.push_back(CurIdx);
+ }
+
+ B.buildShuffleVector(Dst, Vec, OtherVec, ShuffleMask);
+ Helper.eraseInst(MI);
+}
+
class AMDGPUPreLegalizerCombinerHelperState {
protected:
AMDGPUCombinerHelper &Helper;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir
new file mode 100644
index 0000000000000..2fdda596d2c05
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir
@@ -0,0 +1,169 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GFX9PLUS
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,VI
+
+---
+name: test_v2s16_idx0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9PLUS-LABEL: name: test_v2s16_idx0
+ ; GFX9PLUS: liveins: $vgpr0
+ ; GFX9PLUS-NEXT: {{ $}}
+ ; GFX9PLUS-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16)
+ ; GFX9PLUS-NEXT: %ins:_(<2 x s16>) = G_SHUFFLE_VECTOR %src(<2 x s16>), [[BUILD_VECTOR]], shufflemask(2, 1)
+ ; GFX9PLUS-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+ ; VI-LABEL: name: test_v2s16_idx0
+ ; VI: liveins: $vgpr0
+ ; VI-NEXT: {{ $}}
+ ; VI-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+ ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 0
+ ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; VI-NEXT: %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32)
+ ; VI-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+ %src:_(<2 x s16>) = COPY $vgpr0
+ %idx:_(s32) = G_CONSTANT i32 0
+ %elt:_(s16) = G_CONSTANT i16 42
+ %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+ $vgpr0 = COPY %ins
+...
+
+---
+name: test_v2s16_idx1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9PLUS-LABEL: name: test_v2s16_idx1
+ ; GFX9PLUS: liveins: $vgpr0
+ ; GFX9PLUS-NEXT: {{ $}}
+ ; GFX9PLUS-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16)
+ ; GFX9PLUS-NEXT: %ins:_(<2 x s16>) = G_SHUFFLE_VECTOR %src(<2 x s16>), [[BUILD_VECTOR]], shufflemask(0, 2)
+ ; GFX9PLUS-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+ ; VI-LABEL: name: test_v2s16_idx1
+ ; VI: liveins: $vgpr0
+ ; VI-NEXT: {{ $}}
+ ; VI-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+ ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 1
+ ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; VI-NEXT: %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32)
+ ; VI-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+ %src:_(<2 x s16>) = COPY $vgpr0
+ %idx:_(s32) = G_CONSTANT i32 1
+ %elt:_(s16) = G_CONSTANT i16 42
+ %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+ $vgpr0 = COPY %ins
+...
+
+---
+name: test_v2s16_idx2_nofold
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: test_v2s16_idx2_nofold
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %ins:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+ %src:_(<2 x s16>) = COPY $vgpr0
+ %idx:_(s32) = G_CONSTANT i32 2
+ %elt:_(s16) = G_CONSTANT i16 42
+ %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+ $vgpr0 = COPY %ins
+...
+
+---
+name: test_v3s16_idx2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2
+ ; GFX9PLUS-LABEL: name: test_v3s16_idx2
+ ; GFX9PLUS: liveins: $vgpr0_vgpr1_vgpr2
+ ; GFX9PLUS-NEXT: {{ $}}
+ ; GFX9PLUS-NEXT: %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX9PLUS-NEXT: %truncsrc:_(<3 x s16>) = G_TRUNC %src(<3 x s32>)
+ ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16), [[DEF]](s16)
+ ; GFX9PLUS-NEXT: %ins:_(<3 x s16>) = G_SHUFFLE_VECTOR %truncsrc(<3 x s16>), [[BUILD_VECTOR]], shufflemask(0, 1, 3)
+ ; GFX9PLUS-NEXT: %zextins:_(<3 x s32>) = G_ZEXT %ins(<3 x s16>)
+ ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY %zextins(<3 x s32>)
+ ; VI-LABEL: name: test_v3s16_idx2
+ ; VI: liveins: $vgpr0_vgpr1_vgpr2
+ ; VI-NEXT: {{ $}}
+ ; VI-NEXT: %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; VI-NEXT: %truncsrc:_(<3 x s16>) = G_TRUNC %src(<3 x s32>)
+ ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 2
+ ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; VI-NEXT: %ins:_(<3 x s16>) = G_INSERT_VECTOR_ELT %truncsrc, %elt(s16), %idx(s32)
+ ; VI-NEXT: %zextins:_(<3 x s32>) = G_ZEXT %ins(<3 x s16>)
+ ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY %zextins(<3 x s32>)
+ %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ %truncsrc:_(<3 x s16>) = G_TRUNC %src
+ %idx:_(s32) = G_CONSTANT i32 2
+ %elt:_(s16) = G_CONSTANT i16 42
+ %ins:_(<3 x s16>) = G_INSERT_VECTOR_ELT %truncsrc, %elt, %idx
+ %zextins:_(<3 x s32>) = G_ZEXT %ins
+ $vgpr0_vgpr1_vgpr2 = COPY %zextins
+...
+
+---
+name: test_v2s32_idx1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: test_v2s32_idx1
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: %elt:_(s32) = G_CONSTANT i32 42
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR %elt(s32), [[DEF]](s32)
+ ; CHECK-NEXT: %ins:_(<2 x s32>) = G_SHUFFLE_VECTOR %src(<2 x s32>), [[BUILD_VECTOR]], shufflemask(0, 2)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %ins(<2 x s32>)
+ %src:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ %idx:_(s32) = G_CONSTANT i32 1
+ %elt:_(s32) = G_CONSTANT i32 42
+ %ins:_(<2 x s32>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+ $vgpr0_vgpr1 = COPY %ins
+...
+
+---
+name: test_v4s16_idx3
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX9PLUS-LABEL: name: test_v4s16_idx3
+ ; GFX9PLUS: liveins: $vgpr0_vgpr1
+ ; GFX9PLUS-NEXT: {{ $}}
+ ; GFX9PLUS-NEXT: %src:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16)
+ ; GFX9PLUS-NEXT: %ins:_(<4 x s16>) = G_SHUFFLE_VECTOR %src(<4 x s16>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 4)
+ ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY %ins(<4 x s16>)
+ ; VI-LABEL: name: test_v4s16_idx3
+ ; VI: liveins: $vgpr0_vgpr1
+ ; VI-NEXT: {{ $}}
+ ; VI-NEXT: %src:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 3
+ ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+ ; VI-NEXT: %ins:_(<4 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32)
+ ; VI-NEXT: $vgpr0_vgpr1 = COPY %ins(<4 x s16>)
+ %src:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ %idx:_(s32) = G_CONSTANT i32 3
+ %elt:_(s16) = G_CONSTANT i16 42
+ %ins:_(<4 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+ $vgpr0_vgpr1 = COPY %ins
+...
More information about the llvm-branch-commits
mailing list