[llvm] [GlobalISel] Combine G_MERGE_VALUES of x and undef (PR #113616)
Thorsten Schütt via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 25 00:03:23 PDT 2024
https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/113616
>From cb5b8dd1bf867b25b8873a6d5274f93fa8d610b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Thu, 24 Oct 2024 10:31:28 +0200
Subject: [PATCH 1/4] [GlobalISel] Combine G_MERGE_VALUES of x and undef
into zext x
; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[DEF]](s32)
Please continue padding merge values.
// %bits_8_15:_(s8) = G_IMPLICIT_DEF
// %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
%bits_8_15 is defined by G_IMPLICIT_DEF, so its value is undefined and we
are free to pick any value for it. For optimization purposes, we pick zero.
// %0:_(s16) = G_ZEXT %bits_0_7:(s8)
The upper bits of %0 are zero and the lower bits come from %bits_0_7.
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +
.../include/llvm/Target/GlobalISel/Combine.td | 11 +-
llvm/lib/CodeGen/GlobalISel/CMakeLists.txt | 1 +
.../GlobalISel/CombinerHelperArtifacts.cpp | 57 ++++++
.../AArch64/GlobalISel/combine-unmerge.mir | 50 ++++-
llvm/test/CodeGen/AArch64/bswap.ll | 27 +--
llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +--
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 +++++----
.../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++--
llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 177 ++++++++++--------
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +-
11 files changed, 295 insertions(+), 181 deletions(-)
create mode 100644 llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9240a3c3127eb4..dbd9d6d553b24b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -922,6 +922,9 @@ class CombinerHelper {
bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
BuildFnTy &MatchInfo);
+ // merge_values(_, undef) -> zext
+ bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ead4149fc11068..6c84d6ad40471c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -848,6 +848,14 @@ def unmerge_zext_to_zext : GICombineRule<
(apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }])
>;
+/// Transform merge_x_undef -> zext.
+def merge_of_x_and_undef : GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_IMPLICIT_DEF $undef),
+ (G_MERGE_VALUES $root, $x, $undef):$MI,
+ [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
def merge_combines: GICombineGroup<[
unmerge_anyext_build_vector,
unmerge_merge,
@@ -855,7 +863,8 @@ def merge_combines: GICombineGroup<[
unmerge_cst,
unmerge_undef,
unmerge_dead_to_trunc,
- unmerge_zext_to_zext
+ unmerge_zext_to_zext,
+ merge_of_x_and_undef
]>;
// Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index af1717dbf76f39..a45024d120be68 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
GlobalISel.cpp
Combiner.cpp
CombinerHelper.cpp
+ CombinerHelperArtifacts.cpp
CombinerHelperCasts.cpp
CombinerHelperCompares.cpp
CombinerHelperVectorOps.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
new file mode 100644
index 00000000000000..29875b04c37984
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -0,0 +1,57 @@
+//===- CombinerHelperArtifacts.cpp-----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements CombinerHelper for legalization artifacts.
+//
+//===----------------------------------------------------------------------===//
+//
+// G_MERGE_VALUES
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GMerge *Merge = cast<GMerge>(&MI);
+
+ Register Dst = Merge->getReg(0);
+ Register Undef = Merge->getSourceReg(1);
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+ //
+ // %bits_8_15:_(s8) = G_IMPLICIT_DEF
+ // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+ //
+ // ->
+ //
+ // %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+ //
+
+ if (!MRI.hasOneNonDBGUse(Undef) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildZExt(Dst, Merge->getSourceReg(0));
+ };
+ return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 7566d38e6c6cfa..67cbdd19a05684 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -10,9 +10,9 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[C]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -115,9 +115,11 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[ZEXT]](s64)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -136,9 +138,8 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[MV]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64)
; CHECK-NEXT: $h0 = COPY [[UV]](s16)
; CHECK-NEXT: $h1 = COPY [[UV1]](s16)
; CHECK-NEXT: $h2 = COPY [[UV2]](s16)
@@ -539,3 +540,36 @@ body: |
$q0 = COPY %un1(s128)
$q1 = COPY %un2(s128)
...
+
+# Check that we zext the merge
+---
+name: test_merge_undef
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+...
+
+# Check that we don't zext the merge, multi-use
+---
+name: test_merge_undef_multi_use
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef_multi_use
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ ; CHECK-NEXT: $x0 = COPY %def(s64)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+ $x0 = COPY %def(s64)
+...
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 74e4a167ae14ca..afc1d932840ff7 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -45,25 +45,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; The zext here is optimised to an any_extend during isel..
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
-; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w0
-; CHECK-SD-NEXT: mov x0, xzr
-; CHECK-SD-NEXT: rev w8, w8
-; CHECK-SD-NEXT: lsr w8, w8, #16
-; CHECK-SD-NEXT: lsl x1, x8, #48
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, w0
-; CHECK-GI-NEXT: mov x0, xzr
-; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: lsr w8, w8, #16
-; CHECK-GI-NEXT: bfi x8, x8, #32, #32
-; CHECK-GI-NEXT: and x8, x8, #0xffff
-; CHECK-GI-NEXT: lsl x1, x8, #48
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: bswap_i16_to_i128_anyext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: lsl x1, x8, #48
+; CHECK-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..e86282fa1883d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_ashr_i32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_ashr_i32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..2ae9d28cda16a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_lshr_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3
; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3
@@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX8-LABEL: v_lshr_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX9-LABEL: v_lshr_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1749,20 +1749,22 @@ define i65 @v_lshr_i65_33(i65 %value) {
define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_lshr_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s8, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s8, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3
-; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s11, 0
@@ -1771,24 +1773,26 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
;
; GFX10PLUS-LABEL: s_lshr_i65:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3
-; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64
+; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4
+; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5
+; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, %amount
ret i65 %result
@@ -1797,22 +1801,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_lshr_b32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..ac6660b76ded98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1440,6 +1440,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX6-LABEL: v_sext_inreg_i65_22:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1455,6 +1456,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX8-LABEL: v_sext_inreg_i65_22:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1470,6 +1472,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX9-LABEL: v_sext_inreg_i65_22:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1484,6 +1487,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1]
@@ -1555,29 +1559,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s4, s1, 14
-; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
+; GCN-NEXT: s_lshr_b32 s2, s1, 14
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
+; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s2, 14
-; GCN-NEXT: s_mov_b32 s6, s5
+; GCN-NEXT: s_lshl_b32 s7, s4, 14
+; GCN-NEXT: s_mov_b32 s6, s3
; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
+; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
-; GFX10PLUS-NEXT: s_mov_b32 s5, 0
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s5
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
-; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
+; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
+; GFX10PLUS-NEXT: s_mov_b32 s6, s3
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14
+; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..10ecb99731f796 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1580,90 +1580,99 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_shl_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v3
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v4
-; GFX6-NEXT: v_lshl_b64 v[5:6], v[2:3], v3
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v3
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v3
-; GFX6-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], v8
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 64, v4
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
+; GFX6-NEXT: v_lshr_b64 v[5:6], v[0:1], v5
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v4
+; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, 64, v4
+; GFX6-NEXT: v_lshl_b64 v[7:8], v[0:1], v4
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX6-NEXT: v_lshl_b64 v[5:6], v[0:1], v9
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v3
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v3
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 64, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, 64, v4
+; GFX8-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v4, 64, v3
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v3
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_sub_u32_e32 v5, 64, v4
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
+; GFX9-NEXT: v_subrev_u32_e32 v9, 64, v4
+; GFX9-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_shl_i65:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 64, v3
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
-; GFX10-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_sub_nc_u32_e32 v7, 64, v4
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
+; GFX10-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_shl_i65:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v6, 64, v3
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
-; GFX11-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v1, v5, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 v1, 0, v7
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT: v_sub_nc_u32_e32 v7, 64, v4
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
+; GFX11-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
+; GFX11-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
+; GFX11-NEXT: v_or_b32_e32 v1, v6, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v7 :: v_dual_cndmask_b32 v3, v9, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = shl i65 %value, %amount
ret i65 %result
@@ -1720,20 +1729,22 @@ define i65 @v_shl_i65_33(i65 %value) {
define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_shl_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s6, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s5, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
+; GCN-NEXT: s_mov_b32 s3, 0
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
-; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3
-; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3
-; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
+; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4
+; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
-; GCN-NEXT: s_cselect_b32 s3, s6, s8
+; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
+; GCN-NEXT: s_cselect_b32 s3, s4, s8
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b32 s2, s2, s3
; GCN-NEXT: ; return to shader part epilog
@@ -1741,19 +1752,21 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GFX10PLUS-LABEL: s_shl_i65:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s3
; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
-; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
+; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0
-; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 786fe03164690e..b75711590f3793 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -1611,8 +1611,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_mov_b32_e32 v6, 0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
@@ -1963,8 +1963,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_mov_b32_e32 v6, 0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
>From c52dcfe5758edfdb5a0b90bd06e4c9a914005eb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Fri, 25 Oct 2024 06:38:17 +0200
Subject: [PATCH 2/4] address review comments
---
llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp | 8 +++++---
llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir | 2 +-
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 29875b04c37984..3c6bcd9cc144f5 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -33,10 +33,13 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
const GMerge *Merge = cast<GMerge>(&MI);
Register Dst = Merge->getReg(0);
- Register Undef = Merge->getSourceReg(1);
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+ // Otherwise, we would miscompile.
+ if (Merge->getNumSources() > 2)
+ return false;
+
//
// %bits_8_15:_(s8) = G_IMPLICIT_DEF
// %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
@@ -46,8 +49,7 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
// %0:_(s16) = G_ZEXT %bits_0_7:(s8)
//
- if (!MRI.hasOneNonDBGUse(Undef) ||
- !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
return false;
MatchInfo = [=](MachineIRBuilder &B) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 67cbdd19a05684..6a62e01029c1c8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -564,7 +564,7 @@ body: |
; CHECK-LABEL: name: test_merge_undef_multi_use
; CHECK: %opaque:_(s64) = COPY $x0
; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
; CHECK-NEXT: $q0 = COPY %me(s128)
; CHECK-NEXT: $x0 = COPY %def(s64)
%opaque:_(s64) = COPY $x0
>From 34d5ea5c28ce964f29e64041d0ac99a339479853 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Fri, 25 Oct 2024 08:10:47 +0200
Subject: [PATCH 3/4] address review comments
---
.../GlobalISel/CombinerHelperArtifacts.cpp | 9 +-
.../AArch64/GlobalISel/combine-unmerge.mir | 26 +--
llvm/test/CodeGen/AArch64/bswap.ll | 26 ++-
llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +--
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 ++++-----
.../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++--
llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 177 ++++++++----------
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +-
8 files changed, 185 insertions(+), 203 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 3c6bcd9cc144f5..8f4095f01be7a3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -37,8 +37,7 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
// Otherwise, we would miscompile.
- if (Merge->getNumSources() > 2)
- return false;
+ assert(Merge->getNumSources() == 2 && "Unexpected number of operands");
//
// %bits_8_15:_(s8) = G_IMPLICIT_DEF
@@ -46,14 +45,14 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
//
// ->
//
- // %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+ // %0:_(s16) = G_ANYEXT %bits_0_7:(s8)
//
- if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ANYEXT, {DstTy, SrcTy}}))
return false;
MatchInfo = [=](MachineIRBuilder &B) {
- B.buildZExt(Dst, Merge->getSourceReg(0));
+ B.buildAnyExt(Dst, Merge->getSourceReg(0));
};
return true;
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 6a62e01029c1c8..4e9adf847260bd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -10,9 +10,8 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[C]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[DEF]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -115,11 +114,8 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[ZEXT]](s64)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
- ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[DEF]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -137,13 +133,11 @@ name: test_combine_unmerge_merge_incompatible_types
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types
- ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64)
- ; CHECK-NEXT: $h0 = COPY [[UV]](s16)
- ; CHECK-NEXT: $h1 = COPY [[UV1]](s16)
- ; CHECK-NEXT: $h2 = COPY [[UV2]](s16)
- ; CHECK-NEXT: $h3 = COPY [[UV3]](s16)
+ ; CHECK: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: $h0 = COPY [[DEF]](s16)
+ ; CHECK-NEXT: $h1 = COPY [[DEF]](s16)
+ ; CHECK-NEXT: $h2 = COPY [[DEF]](s16)
+ ; CHECK-NEXT: $h3 = COPY [[DEF]](s16)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -548,7 +542,7 @@ body: |
bb.1:
; CHECK-LABEL: name: test_merge_undef
; CHECK: %opaque:_(s64) = COPY $x0
- ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64)
; CHECK-NEXT: $q0 = COPY %me(s128)
%opaque:_(s64) = COPY $x0
%def:_(s64) = G_IMPLICIT_DEF
@@ -564,7 +558,7 @@ body: |
; CHECK-LABEL: name: test_merge_undef_multi_use
; CHECK: %opaque:_(s64) = COPY $x0
; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64)
; CHECK-NEXT: $q0 = COPY %me(s128)
; CHECK-NEXT: $x0 = COPY %def(s64)
%opaque:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index afc1d932840ff7..e86f55d63f754b 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -45,14 +45,24 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; The zext here is optimised to an any_extend during isel..
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
-; CHECK-LABEL: bswap_i16_to_i128_anyext:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: mov x0, xzr
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: lsl x1, x8, #48
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, w0
+; CHECK-SD-NEXT: mov x0, xzr
+; CHECK-SD-NEXT: rev w8, w8
+; CHECK-SD-NEXT: lsr w8, w8, #16
+; CHECK-SD-NEXT: lsl x1, x8, #48
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: mov x0, xzr
+; CHECK-GI-NEXT: rev w8, w8
+; CHECK-GI-NEXT: lsr w8, w8, #16
+; CHECK-GI-NEXT: and x8, x8, #0xffff
+; CHECK-GI-NEXT: lsl x1, x8, #48
+; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index e86282fa1883d9..63f5464371cc62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s2, s1, 1
-; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT: s_ashr_i32 s2, s5, 1
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_lshr_b32 s0, s1, 1
+; GCN-NEXT: s_mov_b32 s1, 0
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_ashr_i32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_mov_b32 s3, 0
-; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
-; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
+; GFX10PLUS-NEXT: s_mov_b32 s1, 0
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 2ae9d28cda16a2..5dd4fa0809131f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_lshr_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, 0
; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, 0
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3
; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3
@@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX8-LABEL: v_lshr_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX9-LABEL: v_lshr_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1749,22 +1749,20 @@ define i65 @v_lshr_i65_33(i65 %value) {
define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_lshr_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s4, s3
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_sub_i32 s10, s4, 64
-; GCN-NEXT: s_sub_i32 s8, 64, s4
-; GCN-NEXT: s_cmp_lt_u32 s4, 64
+; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GCN-NEXT: s_sub_i32 s10, s3, 64
+; GCN-NEXT: s_sub_i32 s8, 64, s3
+; GCN-NEXT: s_cmp_lt_u32 s3, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s3, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
-; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
-; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
-; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
+; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3
+; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3
+; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
+; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s11, 0
@@ -1773,26 +1771,24 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
;
; GFX10PLUS-LABEL: s_lshr_i65:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_mov_b32 s4, s3
-; GFX10PLUS-NEXT: s_mov_b32 s3, 0
-; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64
-; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4
-; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64
+; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
+; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3
+; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
+; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, %amount
ret i65 %result
@@ -1801,22 +1797,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s2, s1, 1
-; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT: s_lshr_b32 s2, s5, 1
+; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GCN-NEXT: s_lshr_b32 s0, s1, 1
+; GCN-NEXT: s_mov_b32 s1, 0
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_lshr_b32 s2, s3, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_mov_b32 s3, 0
-; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
-; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1
+; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
+; GFX10PLUS-NEXT: s_mov_b32 s1, 0
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index ac6660b76ded98..bac80f0777c024 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1440,7 +1440,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX6-LABEL: v_sext_inreg_i65_22:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1456,7 +1455,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX8-LABEL: v_sext_inreg_i65_22:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1472,7 +1470,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX9-LABEL: v_sext_inreg_i65_22:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1487,7 +1484,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1]
@@ -1559,29 +1555,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s3, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s2, s1, 14
-; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
-; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
+; GCN-NEXT: s_lshr_b32 s4, s1, 14
+; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s4, 14
-; GCN-NEXT: s_mov_b32 s6, s3
+; GCN-NEXT: s_lshl_b32 s7, s2, 14
+; GCN-NEXT: s_mov_b32 s6, s5
; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
+; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_mov_b32 s3, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
+; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
+; GFX10PLUS-NEXT: s_mov_b32 s5, 0
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s3
-; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14
-; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
+; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s6, s5
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
+; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 10ecb99731f796..4cf1c92539c36f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1580,99 +1580,90 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_shl_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 64, v4
-; GFX6-NEXT: v_mov_b32_e32 v3, 0
-; GFX6-NEXT: v_lshr_b64 v[5:6], v[0:1], v5
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v4
-; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, 64, v4
-; GFX6-NEXT: v_lshl_b64 v[7:8], v[0:1], v4
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v6
-; GFX6-NEXT: v_lshl_b64 v[5:6], v[0:1], v9
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v3
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v4
+; GFX6-NEXT: v_lshl_b64 v[5:6], v[2:3], v3
+; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v3
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v3
+; GFX6-NEXT: v_or_b32_e32 v9, v4, v5
+; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], v8
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 64, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
-; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, 64, v4
-; GFX8-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v6
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v3
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
+; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v3
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
+; GFX8-NEXT: v_or_b32_e32 v9, v4, v5
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_sub_u32_e32 v5, 64, v4
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
-; GFX9-NEXT: v_subrev_u32_e32 v9, 64, v4
-; GFX9-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v3, v5, v6
-; GFX9-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_sub_u32_e32 v4, 64, v3
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
+; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v3
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v9, v4, v5
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_shl_i65:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_sub_nc_u32_e32 v7, 64, v4
-; GFX10-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
-; GFX10-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
-; GFX10-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v1, v6, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, 64, v3
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
+; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
+; GFX10-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
+; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_shl_i65:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, 0
-; GFX11-NEXT: v_sub_nc_u32_e32 v7, 64, v4
-; GFX11-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
-; GFX11-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
-; GFX11-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v1, v6, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v7 :: v_dual_cndmask_b32 v3, v9, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-NEXT: v_sub_nc_u32_e32 v6, 64, v3
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
+; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
+; GFX11-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
+; GFX11-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 v1, 0, v7
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = shl i65 %value, %amount
ret i65 %result
@@ -1729,22 +1720,20 @@ define i65 @v_shl_i65_33(i65 %value) {
define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_shl_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 s4, s3
-; GCN-NEXT: s_sub_i32 s10, s4, 64
-; GCN-NEXT: s_sub_i32 s5, 64, s4
-; GCN-NEXT: s_cmp_lt_u32 s4, 64
-; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_sub_i32 s10, s3, 64
+; GCN-NEXT: s_sub_i32 s6, 64, s3
+; GCN-NEXT: s_cmp_lt_u32 s3, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s3, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
-; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4
-; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3
+; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
-; GCN-NEXT: s_cselect_b32 s3, s4, s8
+; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GCN-NEXT: s_cselect_b32 s3, s6, s8
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b32 s2, s2, s3
; GCN-NEXT: ; return to shader part epilog
@@ -1752,21 +1741,19 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GFX10PLUS-LABEL: s_shl_i65:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s3
+; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3
; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
-; GFX10PLUS-NEXT: s_mov_b32 s4, s3
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
-; GFX10PLUS-NEXT: s_mov_b32 s3, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3
+; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
-; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index b75711590f3793..786fe03164690e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -1611,8 +1611,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_mov_b32_e32 v6, 0
; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GISEL-NEXT: v_mov_b32_e32 v6, 0
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
@@ -1963,8 +1963,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_mov_b32_e32 v6, 0
; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GISEL-NEXT: v_mov_b32_e32 v6, 0
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
>From 62258b0339d0b54a93c48961d0d671439412365d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Fri, 25 Oct 2024 09:02:54 +0200
Subject: [PATCH 4/4] back to zext
---
.../include/llvm/Target/GlobalISel/Combine.td | 2 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 7 +-
.../GlobalISel/CombinerHelperArtifacts.cpp | 6 +-
.../AArch64/GlobalISel/combine-unmerge.mir | 28 +--
llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 13 +-
llvm/test/CodeGen/AArch64/bswap.ll | 26 +--
.../CodeGen/AArch64/extract-vector-elt.ll | 15 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 24 +--
.../combine-amdgpu-cvt-f32-ubyte.mir | 6 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 86 +++++----
.../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 36 ++--
llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 177 ++++++++++--------
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 4 +-
.../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 7 +-
14 files changed, 220 insertions(+), 217 deletions(-)
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 6c84d6ad40471c..a68310764767a8 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -412,7 +412,7 @@ def binop_right_undef_to_undef: GICombineRule<
def unary_undef_to_zero: GICombineRule<
(defs root:$root),
- (match (wip_match_opcode G_ABS):$root,
+ (match (wip_match_opcode G_ABS, G_ZEXT):$root,
[{ return Helper.matchOperandIsUndef(*${root}, 1); }]),
(apply [{ Helper.replaceInstWithConstant(*${root}, 0); }])>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b7ddf9f479ef8e..e3ba396b0adc31 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2916,8 +2916,11 @@ void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) {
void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) {
assert(MI.getNumDefs() == 1 && "Expected only one def?");
- Builder.buildConstant(MI.getOperand(0), C);
- MI.eraseFromParent();
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ if (isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstTy}})) {
+ Builder.buildConstant(MI.getOperand(0), C);
+ MI.eraseFromParent();
+ }
}
void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, APInt C) {
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 8f4095f01be7a3..047e411eb76cfe 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -45,14 +45,14 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
//
// ->
//
- // %0:_(s16) = G_ANYEXT %bits_0_7:(s8)
+ // %0:_(s16) = G_ZEXT %bits_0_7:(s8)
//
- if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ANYEXT, {DstTy, SrcTy}}))
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
return false;
MatchInfo = [=](MachineIRBuilder &B) {
- B.buildAnyExt(Dst, Merge->getSourceReg(0));
+ B.buildZExt(Dst, Merge->getSourceReg(0));
};
return true;
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 4e9adf847260bd..e4e7f315397ff8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -9,9 +9,9 @@ name: test_combine_unmerge_merge
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge
- ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[C]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -113,9 +113,11 @@ name: test_combine_unmerge_bitcast_merge
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge
- ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[C]](s64)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -133,11 +135,11 @@ name: test_combine_unmerge_merge_incompatible_types
body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types
- ; CHECK: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
- ; CHECK-NEXT: $h0 = COPY [[DEF]](s16)
- ; CHECK-NEXT: $h1 = COPY [[DEF]](s16)
- ; CHECK-NEXT: $h2 = COPY [[DEF]](s16)
- ; CHECK-NEXT: $h3 = COPY [[DEF]](s16)
+ ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+ ; CHECK-NEXT: $h0 = COPY [[C]](s16)
+ ; CHECK-NEXT: $h1 = COPY [[C]](s16)
+ ; CHECK-NEXT: $h2 = COPY [[C]](s16)
+ ; CHECK-NEXT: $h3 = COPY [[C]](s16)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -542,7 +544,7 @@ body: |
bb.1:
; CHECK-LABEL: name: test_merge_undef
; CHECK: %opaque:_(s64) = COPY $x0
- ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64)
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
; CHECK-NEXT: $q0 = COPY %me(s128)
%opaque:_(s64) = COPY $x0
%def:_(s64) = G_IMPLICIT_DEF
@@ -558,7 +560,7 @@ body: |
; CHECK-LABEL: name: test_merge_undef_multi_use
; CHECK: %opaque:_(s64) = COPY $x0
; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: %me:_(s128) = G_ANYEXT %opaque(s64)
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
; CHECK-NEXT: $q0 = COPY %me(s128)
; CHECK-NEXT: $x0 = COPY %def(s64)
%opaque:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index a39c2b5d14dddd..98c1a1bef569ab 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -322,17 +322,18 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) {
;
; CHECK-GI-LABEL: typei1_orig:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr q1, [x2]
+; CHECK-GI-NEXT: ldr q0, [x2]
; CHECK-GI-NEXT: cmp x0, #0
-; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff
; CHECK-GI-NEXT: cset w8, gt
-; CHECK-GI-NEXT: neg v1.8h, v1.8h
-; CHECK-GI-NEXT: dup v2.8h, w8
+; CHECK-GI-NEXT: neg v0.8h, v0.8h
+; CHECK-GI-NEXT: dup v1.8h, w8
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: mul v1.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: cmeq v0.8h, v0.8h, #0
; CHECK-GI-NEXT: mvn v0.16b, v0.16b
-; CHECK-GI-NEXT: mul v1.8h, v1.8h, v2.8h
; CHECK-GI-NEXT: cmeq v1.8h, v1.8h, #0
; CHECK-GI-NEXT: mvn v1.16b, v1.16b
-; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: shl v0.16b, v0.16b, #7
; CHECK-GI-NEXT: sshr v0.16b, v0.16b, #7
; CHECK-GI-NEXT: str q0, [x1]
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index e86f55d63f754b..afc1d932840ff7 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -45,24 +45,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; The zext here is optimised to an any_extend during isel..
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
-; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w0
-; CHECK-SD-NEXT: mov x0, xzr
-; CHECK-SD-NEXT: rev w8, w8
-; CHECK-SD-NEXT: lsr w8, w8, #16
-; CHECK-SD-NEXT: lsl x1, x8, #48
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, w0
-; CHECK-GI-NEXT: mov x0, xzr
-; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: lsr w8, w8, #16
-; CHECK-GI-NEXT: and x8, x8, #0xffff
-; CHECK-GI-NEXT: lsl x1, x8, #48
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: bswap_i16_to_i128_anyext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: lsl x1, x8, #48
+; CHECK-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 0481d997d24faf..d5b7c63a80053a 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -8,17 +8,10 @@
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v4i32_vector_extract_const
define i64 @extract_v2i64_undef_index(<2 x i64> %a, i32 %c) {
-; CHECK-SD-LABEL: extract_v2i64_undef_index:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov x0, d0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: extract_v2i64_undef_index:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str q0, [sp, #-16]!
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GI-NEXT: ldr x0, [sp], #16
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: extract_v2i64_undef_index:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
entry:
%d = extractelement <2 x i64> %a, i32 undef
ret i64 %d
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..e86282fa1883d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_ashr_i32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_ashr_i32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir
index 7893bfa1d38f08..9b39afd32ac378 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir
@@ -261,8 +261,7 @@ body: |
; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_16
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF
- ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16)
+ ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext
; CHECK-NEXT: $vgpr0 = COPY %result(s32)
%arg:_(s32) = COPY $vgpr0
@@ -284,8 +283,7 @@ body: |
; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_24
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF
- ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16)
+ ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext
; CHECK-NEXT: $vgpr0 = COPY %result(s32)
%arg:_(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..2ae9d28cda16a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_lshr_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3
; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3
@@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX8-LABEL: v_lshr_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX9-LABEL: v_lshr_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1749,20 +1749,22 @@ define i65 @v_lshr_i65_33(i65 %value) {
define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_lshr_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s8, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s8, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3
-; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s11, 0
@@ -1771,24 +1773,26 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
;
; GFX10PLUS-LABEL: s_lshr_i65:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3
-; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64
+; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4
+; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5
+; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, %amount
ret i65 %result
@@ -1797,22 +1801,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_lshr_b32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..ac6660b76ded98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1440,6 +1440,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX6-LABEL: v_sext_inreg_i65_22:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1455,6 +1456,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX8-LABEL: v_sext_inreg_i65_22:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1470,6 +1472,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX9-LABEL: v_sext_inreg_i65_22:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1484,6 +1487,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1]
@@ -1555,29 +1559,29 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
; GCN-LABEL: s_sext_inreg_i65_18:
; GCN: ; %bb.0:
-; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GCN-NEXT: s_lshr_b32 s4, s1, 14
-; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
+; GCN-NEXT: s_lshr_b32 s2, s1, 14
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
+; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GCN-NEXT: s_lshl_b32 s7, s2, 14
-; GCN-NEXT: s_mov_b32 s6, s5
+; GCN-NEXT: s_lshl_b32 s7, s4, 14
+; GCN-NEXT: s_mov_b32 s6, s3
; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
+; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_sext_inreg_i65_18:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18
-; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14
-; GFX10PLUS-NEXT: s_mov_b32 s5, 0
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 18
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 14
; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000
-; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
-; GFX10PLUS-NEXT: s_mov_b32 s6, s5
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14
-; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18
+; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
+; GFX10PLUS-NEXT: s_mov_b32 s6, s3
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT: s_lshl_b32 s7, s4, 14
+; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], 18
; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
; GFX10PLUS-NEXT: ; return to shader part epilog
%shl = shl i65 %value, 18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..10ecb99731f796 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1580,90 +1580,99 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_shl_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v3
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v4
-; GFX6-NEXT: v_lshl_b64 v[5:6], v[2:3], v3
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v3
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v3
-; GFX6-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], v8
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 64, v4
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
+; GFX6-NEXT: v_lshr_b64 v[5:6], v[0:1], v5
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v4
+; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, 64, v4
+; GFX6-NEXT: v_lshl_b64 v[7:8], v[0:1], v4
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX6-NEXT: v_lshl_b64 v[5:6], v[0:1], v9
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v3
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v3
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 64, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, 64, v4
+; GFX8-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v4, 64, v3
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3]
-; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v3
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v9, v4, v5
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_sub_u32_e32 v5, 64, v4
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], v4, v[2:3]
+; GFX9-NEXT: v_subrev_u32_e32 v9, 64, v4
+; GFX9-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX9-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v9, v[0:1]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_shl_i65:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 64, v3
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
-; GFX10-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_sub_nc_u32_e32 v7, 64, v4
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
+; GFX10-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_shl_i65:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v6, 64, v3
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v3
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3
-; GFX11-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v1, v5, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 v1, 0, v7
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT: v_sub_nc_u32_e32 v7, 64, v4
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v4, v[2:3]
+; GFX11-NEXT: v_subrev_nc_u32_e32 v9, 64, v4
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v4
+; GFX11-NEXT: v_lshrrev_b64 v[6:7], v7, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v4, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, v[0:1]
+; GFX11-NEXT: v_or_b32_e32 v1, v6, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v7 :: v_dual_cndmask_b32 v3, v9, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = shl i65 %value, %amount
ret i65 %result
@@ -1720,20 +1729,22 @@ define i65 @v_shl_i65_33(i65 %value) {
define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_shl_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s6, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s5, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
+; GCN-NEXT: s_mov_b32 s3, 0
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
-; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3
-; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3
-; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
+; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
+; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s4
+; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
-; GCN-NEXT: s_cselect_b32 s3, s6, s8
+; GCN-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
+; GCN-NEXT: s_cselect_b32 s3, s4, s8
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b32 s2, s2, s3
; GCN-NEXT: ; return to shader part epilog
@@ -1741,19 +1752,21 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
; GFX10PLUS-LABEL: s_shl_i65:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s3
; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
-; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3
-; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s5
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
+; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0
-; GFX10PLUS-NEXT: s_cselect_b32 s3, s4, s6
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s3, s6, s8
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 786fe03164690e..b75711590f3793 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -1611,8 +1611,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_mov_b32_e32 v6, 0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
@@ -1963,8 +1963,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL: ; %bb.0: ; %fp-to-i-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_mov_b32_e32 v6, 0
+; GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f
; GISEL-NEXT: s_mov_b64 s[4:5], 0
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 6ac04d8bc42bba..253377ff47fe74 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -3962,14 +3962,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_not_b32_e32 v2, 31
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -4079,15 +4077,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: s_and_b32 s2, 0xffff, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: s_lshl_b32 s0, s2, 16
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffe0, v3
-; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
More information about the llvm-commits
mailing list