[llvm] [AMDGPU] Canonicalize G_ZEXT of the shift amount in RegBankCombiner (PR #131792)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 26 02:48:26 PDT 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/131792
>From 79951fae5d5d71149523d2539bbfb036a9b941f9 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 17 Mar 2025 15:29:37 +0100
Subject: [PATCH 1/2] [AMDGPU] Canonicalize G_ZEXT of the shift amount in
RegBankCombiner
Canonicalize it to a G_AND instead so that ISel patterns can pick it
up and ignore it, as the shift instructions only read low bits.
G_ZEXT would be lowered to a v/s_and anyway in most cases.
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 18 ++-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 30 ++++
.../GlobalISel/combine-shift-amount-zext.mir | 146 ++++++++++++++++++
3 files changed, 193 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index a21505356274b..a0eb0ffea8d7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -134,6 +134,22 @@ def combine_fmul_with_select_to_fldexp : GICombineRule<
[{ return Helper.matchCombineFmulWithSelectToFldexp(*${root}, *${sel}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+// (shift x, (zext amt)) -> (shift x, (and (anyext amt), mask))
+//
+// The pattern is longer, but is better for matching during ISel.
+class canonicalize_zext_shift_amt<Instruction opc> : GICombineRule<
+ (defs root:$dst),
+ (match (G_ZEXT $amt, $amtsrc):$zext,
+ (opc $dst, $src, $amt):$shift),
+ (apply [{ applyCanonicalizeZextShiftAmt(*${shift}, *${zext}); }])>;
+
+def canonicalize_zext_lshr : canonicalize_zext_shift_amt<G_LSHR>;
+def canonicalize_zext_ashr : canonicalize_zext_shift_amt<G_ASHR>;
+def canonicalize_zext_shl : canonicalize_zext_shift_amt<G_SHL>;
+
+def zext_of_shift_amount_combines : GICombineGroup<[
+ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
+]>;
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
@@ -181,5 +197,5 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
- cast_of_cast_combines]> {
+ cast_of_cast_combines, zext_of_shift_amount_combines]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 98c48f4fe3705..68bffe5bbb7f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -87,6 +87,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
void applyClamp(MachineInstr &MI, Register &Reg) const;
+ void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -362,6 +364,34 @@ void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
MI.eraseFromParent();
}
+void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
+ MachineInstr &MI, MachineInstr &Ext) const {
+ unsigned ShOpc = MI.getOpcode();
+ assert(ShOpc == AMDGPU::G_SHL || ShOpc == AMDGPU::G_LSHR ||
+ ShOpc == AMDGPU::G_ASHR);
+ assert(Ext.getOpcode() == AMDGPU::G_ZEXT);
+
+ Register AmtReg = Ext.getOperand(1).getReg();
+ Register ShDst = MI.getOperand(0).getReg();
+ Register ShSrc = MI.getOperand(1).getReg();
+
+ LLT ExtAmtTy = MRI.getType(Ext.getOperand(0).getReg());
+ LLT AmtTy = MRI.getType(AmtReg);
+
+ auto &RB = *MRI.getRegBank(AmtReg);
+
+ auto NewExt = B.buildAnyExt(ExtAmtTy, AmtReg);
+ auto Mask = B.buildConstant(
+ ExtAmtTy, maskTrailingOnes<uint64_t>(AmtTy.getScalarSizeInBits()));
+ auto And = B.buildAnd(ExtAmtTy, NewExt, Mask);
+ B.buildInstr(ShOpc, {ShDst}, {ShSrc, And});
+
+ MRI.setRegBank(NewExt.getReg(0), RB);
+ MRI.setRegBank(Mask.getReg(0), RB);
+ MRI.setRegBank(And.getReg(0), RB);
+ MI.eraseFromParent();
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
new file mode 100644
index 0000000000000..77d30f6fa5223
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
@@ -0,0 +1,146 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s
+
+---
+name: lshr_zext_i16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: lshr_zext_i16
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s16) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_LSHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: ashr_zext_i16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: ashr_zext_i16
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s16) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_ASHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: shl_zext_i16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: shl_zext_i16
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s16) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_SHL %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: lshr_zext_i8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: lshr_zext_i8
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s8) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_LSHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: ashr_zext_i8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: ashr_zext_i8
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s8) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_ASHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: shl_zext_i8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: shl_zext_i8
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s8) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_SHL %src, %zextamt
+ $sgpr0 = COPY %res
+...
>From a71242df8f384cf87a00754707a3657694bec09c Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 26 Mar 2025 10:39:01 +0100
Subject: [PATCH 2/2] update tests
---
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 32 +---------
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 66 +++++++--------------
2 files changed, 23 insertions(+), 75 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 07fcb02d98649..18019fa31410e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3329,9 +3329,7 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
; GFX6-NEXT: v_and_b32_e32 v3, 15, v2
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3486,10 +3484,8 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
; GFX6-NEXT: v_and_b32_e32 v1, 15, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1
; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: ; return to shader part epilog
@@ -3793,20 +3789,16 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-NEXT: v_and_b32_e32 v6, 15, v4
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v5
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -3942,18 +3934,14 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
; GFX6-NEXT: v_and_b32_e32 v2, 15, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 15, v1
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
@@ -4450,28 +4438,22 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
; GFX6-NEXT: v_and_b32_e32 v9, 15, v6
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_and_b32_e32 v3, 15, v7
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1
; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_and_b32_e32 v3, 15, v8
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -4790,37 +4772,29 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_and_b32_e32 v12, 15, v8
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v9
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v10
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10
; GFX6-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v11
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11
; GFX6-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 2e8c918e4c67e..d441a9a62fc1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3077,11 +3077,9 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -3235,9 +3233,7 @@ define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3570,26 +3566,22 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX6-NEXT: v_and_b32_e32 v6, 15, v4
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v5
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -3735,32 +3727,28 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_bfe_u32 s4, s2, 0xf0001
-; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
; GFX6-NEXT: s_lshr_b32 s4, s4, 14
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 15, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: s_or_b32 s0, s0, s4
; GFX6-NEXT: s_lshl_b32 s2, s2, 1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 15, v1
-; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: s_lshl_b32 s1, s1, 1
; GFX6-NEXT: s_lshr_b32 s4, s4, 14
; GFX6-NEXT: s_lshl_b32 s3, s3, 1
-; GFX6-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 15, v1
+; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: s_or_b32 s1, s1, s4
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 15, v1
; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
@@ -4358,26 +4346,22 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX6-NEXT: v_or_b32_e32 v1, v1, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GFX6-NEXT: v_and_b32_e32 v9, 15, v6
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_and_b32_e32 v3, 15, v8
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1
; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
@@ -4388,9 +4372,7 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
; GFX6-NEXT: v_and_b32_e32 v5, 15, v4
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -4782,26 +4764,22 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
; GFX6-NEXT: v_or_b32_e32 v1, v1, v10
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX6-NEXT: v_and_b32_e32 v11, 15, v8
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v10
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
@@ -4818,20 +4796,16 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_and_b32_e32 v8, 15, v6
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v7
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
More information about the llvm-commits
mailing list