[llvm] [GlobalISel] Combine G_MERGE_VALUES of x and undef (PR #113616)
Thorsten Schütt via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 24 13:28:32 PDT 2024
https://github.com/tschuett created https://github.com/llvm/llvm-project/pull/113616
Combine G_MERGE_VALUES of x and undef into zext x.
; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[DEF]](s32)
Please continue padding merge values.
// %bits_8_15:_(s8) = G_IMPLICIT_DEF
// %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
%bits_8_15 is defined by G_IMPLICIT_DEF, so its value is undefined and we may pick any value for it. We pick zero, which lets the merge be expressed as a zero-extension.
// %0:_(s16) = G_ZEXT %bits_0_7:(s8)
The upper bits of %0 are zero and the lower bits come from %bits_0_7.
From cb5b8dd1bf867b25b8873a6d5274f93fa8d610b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Thu, 24 Oct 2024 10:31:28 +0200
Subject: [PATCH] [GlobalISel] Combine G_MERGE_VALUES of x and undef
into zext x
; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[DEF]](s32)
Please continue padding merge values.
// %bits_8_15:_(s8) = G_IMPLICIT_DEF
// %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
%bits_8_15 is defined by G_IMPLICIT_DEF, so its value is undefined and we may
pick any value for it. We pick zero, which lets the merge become a zext.
// %0:_(s16) = G_ZEXT %bits_0_7:(s8)
The upper bits of %0 are zero and the lower bits come from %bits_0_7.
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +
.../include/llvm/Target/GlobalISel/Combine.td | 11 +-
llvm/lib/CodeGen/GlobalISel/CMakeLists.txt | 1 +
.../GlobalISel/CombinerHelperArtifacts.cpp | 57 ++++++
.../AArch64/GlobalISel/combine-unmerge.mir | 50 ++++-
llvm/test/CodeGen/AArch64/bswap.ll | 27 +--
llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +--
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 +++++----
.../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++--
llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 177 ++++++++++--------
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +-
11 files changed, 295 insertions(+), 181 deletions(-)
create mode 100644 llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9240a3c3127eb4..dbd9d6d553b24b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -922,6 +922,9 @@ class CombinerHelper {
bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
BuildFnTy &MatchInfo);
+ // merge_values(_, undef) -> zext
+ bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ead4149fc11068..6c84d6ad40471c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -848,6 +848,14 @@ def unmerge_zext_to_zext : GICombineRule<
(apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }])
>;
+/// Transform merge_values(x, undef) -> zext(x).
+def merge_of_x_and_undef : GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_IMPLICIT_DEF $undef),
+ (G_MERGE_VALUES $root, $x, $undef):$MI,
+ [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
def merge_combines: GICombineGroup<[
unmerge_anyext_build_vector,
unmerge_merge,
@@ -855,7 +863,8 @@ def merge_combines: GICombineGroup<[
unmerge_cst,
unmerge_undef,
unmerge_dead_to_trunc,
- unmerge_zext_to_zext
+ unmerge_zext_to_zext,
+ merge_of_x_and_undef
]>;
// Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index af1717dbf76f39..a45024d120be68 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
GlobalISel.cpp
Combiner.cpp
CombinerHelper.cpp
+ CombinerHelperArtifacts.cpp
CombinerHelperCasts.cpp
CombinerHelperCompares.cpp
CombinerHelperVectorOps.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
new file mode 100644
index 00000000000000..29875b04c37984
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -0,0 +1,57 @@
+//===- CombinerHelperArtifacts.cpp-----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements CombinerHelper for legalization artifacts.
+//
+//===----------------------------------------------------------------------===//
+//
+// G_MERGE_VALUES
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GMerge *Merge = cast<GMerge>(&MI);
+
+ Register Dst = Merge->getReg(0);
+ Register Undef = Merge->getSourceReg(1);
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+ // Rewrite a merge of x and undef as a zero-extension of x:
+ // %bits_8_15:_(s8) = G_IMPLICIT_DEF
+ // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+ //
+ // ->
+ //
+ // %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+ // The high bits were undef in the input, so zero is a valid choice for them.
+ // Require one non-debug use of the undef and a permitted G_ZEXT.
+ if (!MRI.hasOneNonDBGUse(Undef) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+ return false;
+ // Record the rewrite in MatchInfo: it builds G_ZEXT of source 0 into Dst.
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildZExt(Dst, Merge->getSourceReg(0));
+ };
+ return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 7566d38e6c6cfa..67cbdd19a05684 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -10,9 +10,9 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[C]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -115,9 +115,11 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[ZEXT]](s64)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -136,9 +138,8 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[MV]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64)
; CHECK-NEXT: $h0 = COPY [[UV]](s16)
; CHECK-NEXT: $h1 = COPY [[UV1]](s16)
; CHECK-NEXT: $h2 = COPY [[UV2]](s16)
@@ -539,3 +540,36 @@ body: |
$q0 = COPY %un1(s128)
$q1 = COPY %un2(s128)
...
+
+# Check that we zext the merge
+---
+name: test_merge_undef
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+...
+
+# Check that we don't zext the merge, multi-use
+---
+name: test_merge_undef_multi_use
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef_multi_use
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ ; CHECK-NEXT: $x0 = COPY %def(s64)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+ $x0 = COPY %def(s64)
+...
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 74e4a167ae14ca..afc1d932840ff7 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -45,25 +45,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; The zext here is optimised to an any_extend during isel..
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
-; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w0
-; CHECK-SD-NEXT: mov x0, xzr
-; CHECK-SD-NEXT: rev w8, w8
-; CHECK-SD-NEXT: lsr w8, w8, #16
-; CHECK-SD-NEXT: lsl x1, x8, #48
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, w0
-; CHECK-GI-NEXT: mov x0, xzr
-; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: lsr w8, w8, #16
-; CHECK-GI-NEXT: bfi x8, x8, #32, #32
-; CHECK-GI-NEXT: and x8, x8, #0xffff
-; CHECK-GI-NEXT: lsl x1, x8, #48
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: bswap_i16_to_i128_anyext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: lsl x1, x8, #48
+; CHECK-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..e86282fa1883d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_ashr_i32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_ashr_i32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..2ae9d28cda16a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_lshr_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3
; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3
@@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX8-LABEL: v_lshr_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX9-LABEL: v_lshr_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1749,20 +1749,22 @@ define i65 @v_lshr_i65_33(i65 %value) {
define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_lshr_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s8, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s8, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3
-; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s11, 0
@@ -1771,24 +1773,26 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
;
; GFX10PLUS-LABEL: s_lshr_i65:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3
-; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64
+; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4
+; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5
+; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, %amount
ret i65 %result
@@ -1797,22 +1801,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_lshr_b32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..ac6660b76ded98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1440,6 +1440,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX6-LABEL: v_sext_inreg_i65_22:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1455,6 +1456,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX8-LABEL: v_sext_inreg_i65_22:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1470,6 +1472,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX9-LABEL: v_sext_inreg_i65_22:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1484,6 +1487,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1]
@@ -1555,29 +1559,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s4, s1, 14
-; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
+; GCN-NEXT: s_lshr_b32 s2, s1, 14
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
+; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s2, 14
-; GCN-NEXT: s_mov_b32 s6, s5
+; GCN-NEXT: s_lshl_b32 s7, s4, 14
+; GCN-NEXT: s_mov_b32 s6, s3
; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
+; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
-; GFX10PLUS-NEXT: s_mov_b32 s5, 0
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s5
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
-; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
+; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
+; GFX10PLUS-NEXT: s_mov_b32 s6, s3
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14
+; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..10ecb99731f796 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1580,90 +1580,99 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_shl_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v3
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v4
-; GFX6-NEXT: v_lshl_b64 v[5:6], v[2:3], v3
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v3
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v3
-; GFX6-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], v8
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 64, v4
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
+; GFX6-NEXT: v_lshr_b64 v[5:6], v[0:1], v5
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v4
+; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, 64, v4
+; GFX6-NEXT: v_lshl_b64 v[7:8], v[0:1], v4
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX6-NEXT: v_lshl_b64 v[5:6], v[0:1], v9
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v3
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v3
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 64, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, 64, v4
+; GFX8-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v4, 64, v3
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v3
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_sub_u32_e32 v5, 64, v4
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
+; GFX9-NEXT: v_subrev_u32_e32 v9, 64, v4
+; GFX9-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_shl_i65:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 64, v3
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
-; GFX10-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_sub_nc_u32_e32 v7, 64, v4
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
+; GFX10-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_shl_i65:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v6, 64, v3
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
-; GFX11-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v1, v5, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 v1, 0, v7
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT: v_sub_nc_u32_e32 v7, 64, v4
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
+; GFX11-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
+; GFX11-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
+; GFX11-NEXT: v_or_b32_e32 v1, v6, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v7 :: v_dual_cndmask_b32 v3, v9, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = shl i65 %value, %amount
ret i65 %result
@@ -1720,20 +1729,22 @@ define i65 @v_shl_i65_33(i65 %value) {
define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_shl_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s6, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s5, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
+; GCN-NEXT: s_mov_b32 s3, 0
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
-; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3
-; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3
-; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
+; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4
+; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
-; GCN-NEXT: s_cselect_b32 s3, s6, s8
+; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
+; GCN-NEXT: s_cselect_b32 s3, s4, s8
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b32 s2, s2, s3
; GCN-NEXT: ; return to shader part epilog
@@ -1741,19 +1752,21 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GFX10PLUS-LABEL: s_shl_i65:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s3
; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
-; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
+; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0
-; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 786fe03164690e..b75711590f3793 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -1611,8 +1611,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_mov_b32_e32 v6, 0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
@@ -1963,8 +1963,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_mov_b32_e32 v6, 0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
More information about the llvm-commits
mailing list