[llvm] d4de780 - [AMDGPU] Use "v_bfi_b32 x, 0, z" to implement (z & ~x) (#156636)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 3 04:00:07 PDT 2025
Author: Jay Foad
Date: 2025-09-03T11:00:03Z
New Revision: d4de7809697842e99e4935974d54d3a1f829e59d
URL: https://github.com/llvm/llvm-project/commit/d4de7809697842e99e4935974d54d3a1f829e59d
DIFF: https://github.com/llvm/llvm-project/commit/d4de7809697842e99e4935974d54d3a1f829e59d.diff
LOG: [AMDGPU] Use "v_bfi_b32 x, 0, z" to implement (z & ~x) (#156636)
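Background for the pattern below: V_BFI_B32 is a bitfield insert that computes (s0 & s1) | (~s0 & s2), so forcing the insert operand s1 to zero leaves just s2 & ~s0, i.e. the "and-not" being matched, letting one VALU instruction replace the previous v_not_b32 + v_and_b32 pair. A minimal C sketch of that identity (the helper names here are illustrative, not taken from the LLVM sources):

    #include <assert.h>
    #include <stdint.h>

    /* Software model of the AMDGPU V_BFI_B32 bitfield-insert operation:
       take bits from s1 where s0 is set, and from s2 where s0 is clear. */
    static uint32_t bfi_b32(uint32_t s0, uint32_t s1, uint32_t s2) {
        return (s0 & s1) | (~s0 & s2);
    }

    int main(void) {
        uint32_t x = 0x0ff00f0fu, z = 0x12345678u;
        /* With s1 = 0, BFI keeps only the bits of z where x is clear,
           which is exactly z & ~x. */
        assert(bfi_b32(x, 0u, z) == (z & ~x));
        return 0;
    }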
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
llvm/test/CodeGen/AMDGPU/andorn2.ll
llvm/test/CodeGen/AMDGPU/anyext.ll
llvm/test/CodeGen/AMDGPU/bitop3.ll
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 46eab2a0a98c7..9cc9af7575db6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2480,6 +2480,22 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
+// (z & ~x)
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
+ (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z)
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+ (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+>;
+
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPatIgnoreCopies <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index e1ef3f9be0a5d..aa38c63dc9dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -99,15 +99,13 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_and_b32_e32 v0, v0, v1
+; GCN-NEXT: v_bfi_b32 v0, v1, 0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i32 %src1, -1
%and = and i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, s2, v0
+; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i32_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%and = and i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
; GCN-LABEL: v_andn2_i32_vs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_not_b32 s0, s2
-; GCN-NEXT: v_and_b32_e32 v0, s0, v0
+; GCN-NEXT: v_bfi_b32 v0, s2, 0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i32_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_not_b32 s0, s2
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, 0, v0
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%and = and i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v2, v2
-; GCN-NEXT: v_not_b32_e32 v3, v3
-; GCN-NEXT: v_and_b32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v1, v1, v3
+; GCN-NEXT: v_bfi_b32 v0, v2, 0, v0
+; GCN-NEXT: v_bfi_b32 v1, v3, 0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
-; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, 0, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, 0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
@@ -269,18 +259,14 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_and_b32_e32 v0, s2, v0
-; GCN-NEXT: v_and_b32_e32 v1, s3, v1
+; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
+; GCN-NEXT: v_bfi_b32 v1, v1, 0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i64_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, 0, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..fd329e230e78b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -396,8 +396,7 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
@@ -784,19 +783,17 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX6-LABEL: v_fshl_v2i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX6-NEXT: v_and_b32_e32 v5, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0
; GFX6-NEXT: v_bfe_u32 v5, v1, 1, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5
+; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v4
-; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8
-; GFX6-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1
@@ -1184,38 +1181,34 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX6-LABEL: v_fshl_v4i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX6-NEXT: v_and_b32_e32 v9, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
; GFX6-NEXT: v_bfe_u32 v9, v1, 1, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v6
-; GFX6-NEXT: v_not_b32_e32 v6, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3
; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8
-; GFX6-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT: v_bfi_b32 v6, v6, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: v_and_b32_e32 v3, 7, v7
-; GFX6-NEXT: v_not_b32_e32 v6, v7
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4
; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8
-; GFX6-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT: v_bfi_b32 v6, v7, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v8
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
; GFX6-NEXT: v_and_b32_e32 v4, 7, v8
-; GFX6-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT: v_bfi_b32 v6, v8, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5
@@ -5023,10 +5016,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX6-LABEL: v_fshl_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 63, v4
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: v_not_b32_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 63
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
@@ -5036,10 +5028,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX8-LABEL: v_fshl_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v5, 63, v4
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: v_not_b32_e32 v4, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX8-NEXT: v_bfi_b32 v4, v4, 0, 63
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
@@ -5049,10 +5040,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX9-LABEL: v_fshl_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_not_b32_e32 v4, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX9-NEXT: v_bfi_b32 v4, v4, 0, 63
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
@@ -5062,12 +5052,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX10-LABEL: v_fshl_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v5, v4
; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -5075,16 +5064,14 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX11-LABEL: v_fshl_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v5, v4
; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX11-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5204,10 +5191,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
; GFX6-LABEL: v_fshl_i64_ssv:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_and_b32_e32 v1, 63, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v0
; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1
; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT: v_bfi_b32 v0, v0, 0, 63
; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v3
; GFX6-NEXT: v_or_b32_e32 v1, v2, v4
@@ -5216,10 +5202,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
; GFX8-LABEL: v_fshl_i64_ssv:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_and_b32_e32 v1, 63, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, 63
; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1]
; GFX8-NEXT: v_or_b32_e32 v0, v1, v3
; GFX8-NEXT: v_or_b32_e32 v1, v2, v4
@@ -5228,10 +5213,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
; GFX9-LABEL: v_fshl_i64_ssv:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_and_b32_e32 v1, 63, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: v_bfi_b32 v0, v0, 0, 63
; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1]
; GFX9-NEXT: v_or_b32_e32 v0, v1, v3
; GFX9-NEXT: v_or_b32_e32 v1, v2, v4
@@ -5239,11 +5223,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX10-LABEL: v_fshl_i64_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_not_b32_e32 v1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 63, v0
+; GFX10-NEXT: v_bfi_b32 v2, v0, 0, 63
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
@@ -5251,16 +5234,14 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX11-LABEL: v_fshl_i64_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_not_b32_e32 v1, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 63, v0
+; GFX11-NEXT: v_bfi_b32 v2, v0, 0, 63
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5466,18 +5447,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX6-LABEL: v_fshl_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1
-; GFX6-NEXT: v_not_b32_e32 v8, v8
-; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX6-NEXT: v_bfi_b32 v8, v8, 0, 63
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
-; GFX6-NEXT: v_not_b32_e32 v4, v10
-; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5487,18 +5466,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX8-LABEL: v_fshl_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX8-NEXT: v_not_b32_e32 v8, v8
-; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX8-NEXT: v_bfi_b32 v8, v8, 0, 63
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT: v_not_b32_e32 v4, v10
-; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5508,18 +5485,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX9-LABEL: v_fshl_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX9-NEXT: v_not_b32_e32 v8, v8
-; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX9-NEXT: v_bfi_b32 v8, v8, 0, 63
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX9-NEXT: v_not_b32_e32 v4, v10
-; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
; GFX9-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5529,18 +5504,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX10-LABEL: v_fshl_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v9, v8
-; GFX10-NEXT: v_not_b32_e32 v11, v10
; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
-; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7]
+; GFX10-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX10-NEXT: v_bfi_b32 v8, v8, 0, 63
+; GFX10-NEXT: v_and_b32_e32 v11, 63, v10
+; GFX10-NEXT: v_bfi_b32 v10, v10, 0, 63
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5550,20 +5523,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX11-LABEL: v_fshl_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v9, v8
-; GFX11-NEXT: v_not_b32_e32 v11, v10
; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 63, v10
-; GFX11-NEXT: v_and_b32_e32 v11, 63, v11
+; GFX11-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX11-NEXT: v_bfi_b32 v8, v8, 0, 63
+; GFX11-NEXT: v_and_b32_e32 v11, 63, v10
+; GFX11-NEXT: v_bfi_b32 v10, v10, 0, 63
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX11-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7]
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: v_or_b32_e32 v1, v1, v5
@@ -5818,32 +5789,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-LABEL: v_fshl_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15
-; GFX6-NEXT: v_add_i32_e32 v17, vcc, 0xffffffc0, v15
+; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v8
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v16
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, 0xffffffc0, v16
; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9
-; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15
-; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v17
+; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v16
+; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v16
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v18
; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
; GFX6-NEXT: v_or_b32_e32 v10, v10, v12
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1
+; GFX6-NEXT: v_mov_b32_e32 v15, 0x7f
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6
-; GFX6-NEXT: v_not_b32_e32 v4, v8
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1
-; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4
-; GFX6-NEXT: v_not_b32_e32 v16, 63
+; GFX6-NEXT: v_bfi_b32 v14, v8, 0, v15
+; GFX6-NEXT: v_not_b32_e32 v17, 63
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14
-; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16
+; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v17
; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14
; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6
; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14
@@ -5867,32 +5838,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-LABEL: v_fshl_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffffc0, v15
+; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v8
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v16
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffffc0, v16
; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v18, v[0:1]
; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
; GFX8-NEXT: v_or_b32_e32 v10, v10, v12
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v15, 0x7f
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6
-; GFX8-NEXT: v_not_b32_e32 v4, v8
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
-; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4
-; GFX8-NEXT: v_not_b32_e32 v16, 63
+; GFX8-NEXT: v_bfi_b32 v14, v8, 0, v15
+; GFX8-NEXT: v_not_b32_e32 v17, 63
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v17
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3]
@@ -5916,27 +5887,27 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-LABEL: v_fshl_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15
-; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15
+; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v8
+; GFX9-NEXT: v_sub_u32_e32 v9, 64, v16
+; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16
; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1]
; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
; GFX9-NEXT: v_or_b32_e32 v10, v10, v12
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX9-NEXT: v_mov_b32_e32 v15, 0x7f
; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
-; GFX9-NEXT: v_not_b32_e32 v4, v8
; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
-; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4
+; GFX9-NEXT: v_bfi_b32 v14, v8, 0, v15
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1
; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14
; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14
@@ -5963,99 +5934,96 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-LABEL: v_fshl_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8
-; GFX10-NEXT: v_not_b32_e32 v12, v8
+; GFX10-NEXT: v_and_b32_e32 v17, 0x7f, v8
; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18
-; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v12
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3]
+; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX10-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7]
+; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v17
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3]
+; GFX10-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5
-; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19
-; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
-; GFX10-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
-; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX10-NEXT: v_or_b32_e32 v11, v11, v9
-; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18
-; GFX10-NEXT: v_or_b32_e32 v0, v14, v16
-; GFX10-NEXT: v_or_b32_e32 v10, v15, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19
-; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v18
+; GFX10-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17
+; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18
+; GFX10-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10]
+; GFX10-NEXT: v_or_b32_e32 v11, v11, v7
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5]
+; GFX10-NEXT: v_or_b32_e32 v8, v12, v8
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v18
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10]
+; GFX10-NEXT: v_or_b32_e32 v0, v6, v15
+; GFX10-NEXT: v_or_b32_e32 v6, v7, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s5
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10]
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v6, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5
-; GFX10-NEXT: v_or_b32_e32 v0, v12, v4
-; GFX10-NEXT: v_or_b32_e32 v1, v7, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v1, s5
+; GFX10-NEXT: v_or_b32_e32 v0, v13, v4
+; GFX10-NEXT: v_or_b32_e32 v1, v9, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX10-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshl_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8
-; GFX11-NEXT: v_not_b32_e32 v12, v8
+; GFX11-NEXT: v_and_b32_e32 v17, 0x7f, v8
; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18
-; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v12
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
-; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5
-; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19
-; GFX11-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
-; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
-; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX11-NEXT: v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v0, v14, v16
-; GFX11-NEXT: v_or_b32_e32 v10, v15, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18
+; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX11-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
+; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v17
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3]
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17
+; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v18
+; GFX11-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18
+; GFX11-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10]
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v7
+; GFX11-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5]
+; GFX11-NEXT: v_or_b32_e32 v8, v12, v8
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v18
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo
+; GFX11-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10]
+; GFX11-NEXT: v_or_b32_e32 v0, v6, v15
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v16
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s1
+; GFX11-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v0, v12, v4
-; GFX11-NEXT: v_or_b32_e32 v1, v7, v5
+; GFX11-NEXT: v_or_b32_e32 v0, v13, v4
+; GFX11-NEXT: v_or_b32_e32 v1, v9, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX11-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
ret i128 %result
@@ -6064,264 +6032,260 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
; GFX6-LABEL: v_fshl_i128_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
+; GFX6-NEXT: v_and_b32_e32 v8, 0x7f, v0
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v8
; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1
-; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7
-; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0xffffffc0, v7
-; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v8
+; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0xffffffc0, v8
+; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v8
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
; GFX6-NEXT: v_or_b32_e32 v4, v2, v4
-; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v9
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT: v_not_b32_e32 v0, v0
+; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v10
+; GFX6-NEXT: v_mov_b32_e32 v7, 0x7f
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
; GFX6-NEXT: s_mov_b32 s8, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: v_mov_b32_e32 v4, s3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX6-NEXT: s_lshl_b32 s9, s6, 31
-; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc
+; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v7
+; GFX6-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11
-; GFX6-NEXT: v_not_b32_e32 v8, 63
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7
+; GFX6-NEXT: v_not_b32_e32 v9, 63
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7
; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2
-; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v9
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v8
-; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v11
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v9
+; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v10, v0
; GFX6-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v11, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_i128_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
+; GFX8-NEXT: v_and_b32_e32 v8, 0x7f, v0
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v8
; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffffc0, v7
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffffc0, v8
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
; GFX8-NEXT: v_or_b32_e32 v4, v2, v4
-; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT: v_not_b32_e32 v0, v0
+; GFX8-NEXT: v_lshlrev_b64 v[1:2], v10, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x7f
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
; GFX8-NEXT: s_mov_b32 s8, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX8-NEXT: s_lshl_b32 s9, s6, 31
-; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc
+; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v11
-; GFX8-NEXT: v_not_b32_e32 v8, 63
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7
+; GFX8-NEXT: v_not_b32_e32 v9, 63
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v9
; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[2:3]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v9, s[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v11, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshl_i128_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7f, v0
+; GFX9-NEXT: v_sub_u32_e32 v1, 64, v8
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
-; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7
-; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_or_b32_e32 v4, v2, v4
-; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT: v_not_b32_e32 v0, v0
+; GFX9-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7f
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX9-NEXT: s_lshl_b32 s9, s6, 31
-; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v7
; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc
; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10
+; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v7
; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, v8, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v9, v0
; GFX9-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX9-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v10, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshl_i128_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX10-NEXT: v_not_b32_e32 v6, v0
+; GFX10-NEXT: v_and_b32_e32 v11, 0x7f, v0
+; GFX10-NEXT: v_bfi_b32 v12, v0, 0, 0x7f
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
; GFX10-NEXT: s_lshl_b32 s9, s6, 31
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v6
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v11
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11
+; GFX10-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3]
+; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1]
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1]
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v13
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7]
-; GFX10-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX10-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9]
+; GFX10-NEXT: v_or_b32_e32 v4, v4, v2
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX10-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX10-NEXT: v_or_b32_e32 v9, v1, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
+; GFX10-NEXT: v_or_b32_e32 v0, v5, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshl_i128_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX11-NEXT: v_not_b32_e32 v6, v0
-; GFX11-NEXT: s_lshl_b32 s9, s6, 31
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
+; GFX11-NEXT: v_and_b32_e32 v11, 0x7f, v0
+; GFX11-NEXT: v_bfi_b32 v12, v0, 0, 0x7f
; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_lshl_b32 s9, s6, 31
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v13
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7]
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v13
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_add_nc_u32 v13, 0xffffffc0, v12
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v11
+; GFX11-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3]
+; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0
-; GFX11-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX11-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-NEXT: v_or_b32_e32 v9, v1, v10
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX11-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v5, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v9
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -7445,185 +7409,183 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-LABEL: v_fshl_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23
-; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17
-; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1
-; GFX6-NEXT: v_not_b32_e32 v16, v16
-; GFX6-NEXT: v_or_b32_e32 v21, v17, v21
-; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10
-; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1
-; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX6-NEXT: v_or_b32_e32 v9, v9, v17
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24
-; GFX6-NEXT: v_or_b32_e32 v22, v18, v22
+; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16
+; GFX6-NEXT: v_not_b32_e32 v18, 63
+; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19
+; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v18
+; GFX6-NEXT: v_lshr_b64 v[23:24], v[0:1], v23
+; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v19
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v27
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX6-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX6-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1
+; GFX6-NEXT: v_mov_b32_e32 v17, 0x7f
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1
+; GFX6-NEXT: v_bfi_b32 v10, v16, 0, v17
+; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc
+; GFX6-NEXT: v_add_i32_e32 v16, vcc, v10, v18
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v10
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v10
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[0:1], v10
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v21
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v16
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX6-NEXT: v_or_b32_e32 v11, v11, v22
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[6:7], v16
+; GFX6-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX6-NEXT: v_add_i32_e32 v19, vcc, v16, v18
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16
+; GFX6-NEXT: v_or_b32_e32 v16, v10, v21
+; GFX6-NEXT: v_or_b32_e32 v21, v11, v22
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v19
+; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5]
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[12:13], 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v14
+; GFX6-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], 1
+; GFX6-NEXT: v_bfi_b32 v14, v20, 0, v17
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v14, v18
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v14
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX6-NEXT: v_lshr_b64 v[12:13], v[10:11], v14
+; GFX6-NEXT: v_lshr_b64 v[14:15], v[8:9], v14
; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16
-; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24
-; GFX6-NEXT: v_not_b32_e32 v25, 63
-; GFX6-NEXT: v_or_b32_e32 v18, v18, v16
-; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25
-; GFX6-NEXT: v_or_b32_e32 v19, v19, v17
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0
-; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v26, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v17, v8
-; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20
-; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
-; GFX6-NEXT: v_or_b32_e32 v1, v18, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17
-; GFX6-NEXT: v_or_b32_e32 v3, v16, v19
-; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25
-; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14
-; GFX6-NEXT: v_not_b32_e32 v8, v20
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1
-; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10
-; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25
-; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v18, v5
-; GFX6-NEXT: v_or_b32_e32 v6, v17, v6
-; GFX6-NEXT: v_or_b32_e32 v7, v19, v7
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v18
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX6-NEXT: v_or_b32_e32 v15, v15, v17
+; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, v24, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v25, v1
+; GFX6-NEXT: v_or_b32_e32 v3, v23, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v9
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v10
+; GFX6-NEXT: v_or_b32_e32 v7, v7, v11
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23
-; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX8-NEXT: v_not_b32_e32 v16, v16
-; GFX8-NEXT: v_or_b32_e32 v21, v17, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10
-; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX8-NEXT: v_or_b32_e32 v9, v9, v17
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24
-; GFX8-NEXT: v_or_b32_e32 v22, v18, v22
+; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16
+; GFX8-NEXT: v_not_b32_e32 v18, 63
+; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v18
+; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v27, v[0:1]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX8-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX8-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v17, 0x7f
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11]
+; GFX8-NEXT: v_bfi_b32 v10, v16, 0, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v18
+; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v10
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v10, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v16, v[6:7]
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, v16, v18
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5]
+; GFX8-NEXT: v_or_b32_e32 v16, v10, v21
+; GFX8-NEXT: v_or_b32_e32 v21, v11, v22
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 31, v14
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15]
+; GFX8-NEXT: v_bfi_b32 v14, v20, 0, v17
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v14, v18
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v14
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[10:11]
+; GFX8-NEXT: v_lshrrev_b64 v[14:15], v14, v[8:9]
; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9]
-; GFX8-NEXT: v_not_b32_e32 v25, 63
-; GFX8-NEXT: v_or_b32_e32 v18, v18, v16
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25
-; GFX8-NEXT: v_or_b32_e32 v19, v19, v17
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v26, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v17, v8
-; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20
-; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
-; GFX8-NEXT: v_or_b32_e32 v1, v18, v3
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7]
-; GFX8-NEXT: v_or_b32_e32 v3, v16, v19
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25
-; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14
-; GFX8-NEXT: v_not_b32_e32 v8, v20
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8
-; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25
-; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT: v_or_b32_e32 v5, v18, v5
-; GFX8-NEXT: v_or_b32_e32 v6, v17, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v19, v7
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v18, v[10:11]
+; GFX8-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX8-NEXT: v_or_b32_e32 v15, v15, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v24, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v25, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v23, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v8
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v9
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v10
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v11
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_v2i128:
@@ -7632,17 +7594,17 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16
; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23
-; GFX9-NEXT: v_not_b32_e32 v16, v16
+; GFX9-NEXT: v_mov_b32_e32 v24, 0x7f
; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9
; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24
+; GFX9-NEXT: v_bfi_b32 v25, v16, 0, v24
+; GFX9-NEXT: v_sub_u32_e32 v16, 64, v25
; GFX9-NEXT: v_or_b32_e32 v21, v17, v21
; GFX9-NEXT: v_or_b32_e32 v22, v18, v22
; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
; GFX9-NEXT: v_or_b32_e32 v18, v18, v16
; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23
@@ -7650,48 +7612,47 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25
; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX9-NEXT: v_or_b32_e32 v1, v18, v3
-; GFX9-NEXT: v_or_b32_e32 v3, v16, v9
-; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20
-; GFX9-NEXT: v_or_b32_e32 v0, v25, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT: v_or_b32_e32 v0, v26, v2
; GFX9-NEXT: v_or_b32_e32 v2, v17, v8
-; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7]
-; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v20
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
+; GFX9-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX9-NEXT: v_sub_u32_e32 v3, 64, v17
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7]
+; GFX9-NEXT: v_or_b32_e32 v3, v16, v19
+; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v17
; GFX9-NEXT: v_or_b32_e32 v10, v8, v10
; GFX9-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc
-; GFX9-NEXT: v_not_b32_e32 v8, v20
; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8
+; GFX9-NEXT: v_bfi_b32 v13, v20, 0, v24
; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5
; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13
; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5]
@@ -7709,68 +7670,66 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v4, v17, v4
+; GFX9-NEXT: v_or_b32_e32 v4, v16, v4
; GFX9-NEXT: v_or_b32_e32 v5, v18, v5
-; GFX9-NEXT: v_or_b32_e32 v6, v16, v6
+; GFX9-NEXT: v_or_b32_e32 v6, v17, v6
; GFX9-NEXT: v_or_b32_e32 v7, v12, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshl_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16
-; GFX10-NEXT: v_not_b32_e32 v21, v16
+; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v16
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v27
-; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21
-; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27
-; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3]
+; GFX10-NEXT: v_bfi_b32 v29, v16, 0, 0x7f
+; GFX10-NEXT: v_sub_nc_u32_e32 v21, 64, v19
+; GFX10-NEXT: v_add_nc_u32_e32 v25, 0xffffffc0, v19
+; GFX10-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3]
; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9
-; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28
-; GFX10-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX10-NEXT: v_or_b32_e32 v18, v18, v21
-; GFX10-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28
-; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
-; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v0, v19, v22
-; GFX10-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v21, v23, v25
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27
-; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1]
+; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v29
+; GFX10-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX10-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9]
+; GFX10-NEXT: v_or_b32_e32 v21, v21, v23
+; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29
+; GFX10-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11]
+; GFX10-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v0, v22, v24
+; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11]
+; GFX10-NEXT: v_or_b32_e32 v19, v25, v27
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v29
+; GFX10-NEXT: v_or_b32_e32 v22, v26, v28
; GFX10-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v21, s5
-; GFX10-NEXT: v_or_b32_e32 v22, v24, v26
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v29, v2, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v19, v22, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v19, s5
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v29, v[10:11]
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v22, s5
; GFX10-NEXT: v_cndmask_b32_e64 v22, v23, v3, s4
; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20
-; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5
-; GFX10-NEXT: v_or_b32_e32 v0, v16, v2
-; GFX10-NEXT: v_not_b32_e32 v16, v20
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v23
+; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23
+; GFX10-NEXT: v_bfi_b32 v20, v20, 0, 0x7f
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s5
-; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v16
-; GFX10-NEXT: v_or_b32_e32 v1, v17, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX10-NEXT: v_or_b32_e32 v0, v30, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7]
; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9
; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15]
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20
; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
; GFX10-NEXT: v_or_b32_e32 v10, v2, v10
; GFX10-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20
@@ -7807,96 +7766,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX11-LABEL: v_fshl_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16
-; GFX11-NEXT: v_not_b32_e32 v21, v16
+; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v16
; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1]
-; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v27
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27
+; GFX11-NEXT: v_bfi_b32 v29, v16, 0, 0x7f
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19
; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9
; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21
-; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v18, v18, v21
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo
-; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28
-; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
-; GFX11-NEXT: v_or_b32_e32 v0, v19, v22
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v28
-; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX11-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v21, v23, v25
+; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v29
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v29
+; GFX11-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo
+; GFX11-NEXT: v_sub_nc_u32_e32 v21, 64, v19
+; GFX11-NEXT: v_dual_cndmask_b32 v18, 0, v18 :: v_dual_add_nc_u32 v25, 0xffffffc0, v19
+; GFX11-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3]
+; GFX11-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29
+; GFX11-NEXT: v_or_b32_e32 v19, v25, v27
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v0, v22, v24
+; GFX11-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11]
+; GFX11-NEXT: v_or_b32_e32 v22, v26, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0
; GFX11-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v22, v24, v26
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v21, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_cndmask_b32_e64 v21, v29, v2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v19, v22, s1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29
+; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v19, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v29, v[10:11]
+; GFX11-NEXT: v_cndmask_b32_e64 v10, v17, v22, s1
; GFX11-NEXT: v_cndmask_b32_e64 v22, v23, v3, s0
; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23
-; GFX11-NEXT: v_or_b32_e32 v0, v16, v2
-; GFX11-NEXT: v_not_b32_e32 v16, v20
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo
+; GFX11-NEXT: v_bfi_b32 v20, v20, 0, 0x7f
; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v23
-; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5]
-; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v16
-; GFX11-NEXT: v_or_b32_e32 v1, v17, v3
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7]
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23
+; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v23
+; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1
+; GFX11-NEXT: v_or_b32_e32 v0, v30, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23
; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9
; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15]
-; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5]
-; GFX11-NEXT: v_or_b32_e32 v10, v2, v10
; GFX11-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20
+; GFX11-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7]
+; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[8:9]
+; GFX11-NEXT: v_or_b32_e32 v10, v2, v10
; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15]
; GFX11-NEXT: v_or_b32_e32 v2, v21, v24
; GFX11-NEXT: v_or_b32_e32 v11, v3, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v10, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_cndmask_b32 v12, 0, v12 :: v_dual_cndmask_b32 v21, v4, v10
; GFX11-NEXT: v_lshrrev_b64 v[3:4], v26, v[14:15]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
; GFX11-NEXT: v_or_b32_e32 v10, v16, v18
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
; GFX11-NEXT: v_or_b32_e32 v16, v17, v19
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v10, s1
; GFX11-NEXT: v_lshrrev_b64 v[10:11], v20, v[14:15]
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v8, s2
-; GFX11-NEXT: v_or_b32_e32 v3, v22, v25
; GFX11-NEXT: v_cndmask_b32_e64 v8, v4, v9, s2
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v10, s1
; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v11, s1
+; GFX11-NEXT: v_or_b32_e32 v3, v22, v25
; GFX11-NEXT: v_or_b32_e32 v4, v12, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v5, v13, v8
; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v7, v7, v10
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 238cc06fc7f7c..d1ba24673043d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -398,8 +398,7 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
@@ -785,19 +784,17 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX6-LABEL: v_fshr_v2i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX6-NEXT: v_and_b32_e32 v5, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
@@ -1187,40 +1184,36 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX6-LABEL: v_fshr_v4i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v7, 8, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v9, 24, v2
; GFX6-NEXT: v_and_b32_e32 v10, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v7
-; GFX6-NEXT: v_not_b32_e32 v7, v7
-; GFX6-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX6-NEXT: v_bfi_b32 v7, v7, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3
; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7
-; GFX6-NEXT: v_not_b32_e32 v7, v8
; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: v_and_b32_e32 v3, 7, v8
-; GFX6-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX6-NEXT: v_bfi_b32 v7, v8, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1
; GFX6-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX6-NEXT: v_not_b32_e32 v4, v9
; GFX6-NEXT: v_and_b32_e32 v3, 7, v9
-; GFX6-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT: v_bfi_b32 v4, v9, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5
@@ -5052,8 +5045,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT: v_not_b32_e32 v5, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 63, v5
+; GFX6-NEXT: v_bfi_b32 v5, v4, 0, 63
; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
@@ -5065,8 +5057,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT: v_not_b32_e32 v5, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 63, v5
+; GFX8-NEXT: v_bfi_b32 v5, v4, 0, 63
; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
@@ -5078,8 +5069,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: v_not_b32_e32 v5, v4
-; GFX9-NEXT: v_and_b32_e32 v5, 63, v5
+; GFX9-NEXT: v_bfi_b32 v5, v4, 0, 63
; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
@@ -5090,12 +5080,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX10-LABEL: v_fshr_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v5, v4
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -5103,16 +5092,14 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX11-LABEL: v_fshr_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v5, v4
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5228,9 +5215,8 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) {
define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) {
; GFX6-LABEL: v_fshr_i64_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_not_b32_e32 v1, v0
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX6-NEXT: v_bfi_b32 v1, v0, 0, 63
; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1
; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0
@@ -5240,9 +5226,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX8-LABEL: v_fshr_i64_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_not_b32_e32 v1, v0
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX8-NEXT: v_bfi_b32 v1, v0, 0, 63
; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3]
@@ -5252,9 +5237,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX9-LABEL: v_fshr_i64_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_not_b32_e32 v1, v0
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX9-NEXT: v_bfi_b32 v1, v0, 0, 63
; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3]
@@ -5264,29 +5248,27 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX10-LABEL: v_fshr_i64_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_not_b32_e32 v1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT: v_bfi_b32 v1, v0, 0, 63
+; GFX10-NEXT: v_and_b32_e32 v2, 63, v0
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i64_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_not_b32_e32 v1, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX11-NEXT: v_bfi_b32 v1, v0, 0, 63
+; GFX11-NEXT: v_and_b32_e32 v2, 63, v0
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
%cast = bitcast i64 %result to <2 x float>
@@ -5492,15 +5474,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT: v_not_b32_e32 v9, v8
-; GFX6-NEXT: v_and_b32_e32 v9, 63, v9
+; GFX6-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v10
-; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
@@ -5513,15 +5493,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT: v_not_b32_e32 v9, v8
-; GFX8-NEXT: v_and_b32_e32 v9, 63, v9
+; GFX8-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v4, v10
-; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
@@ -5534,15 +5512,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: v_not_b32_e32 v9, v8
-; GFX9-NEXT: v_and_b32_e32 v9, 63, v9
+; GFX9-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX9-NEXT: v_not_b32_e32 v4, v10
-; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
@@ -5554,16 +5530,14 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX10-LABEL: v_fshr_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v9, v8
-; GFX10-NEXT: v_not_b32_e32 v11, v10
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
+; GFX10-NEXT: v_bfi_b32 v11, v10, 0, 63
; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
@@ -5575,17 +5549,15 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX11-LABEL: v_fshr_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v9, v8
-; GFX11-NEXT: v_not_b32_e32 v11, v10
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX11-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX11-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX11-NEXT: v_and_b32_e32 v11, 63, v11
+; GFX11-NEXT: v_bfi_b32 v11, v10, 0, 63
; GFX11-NEXT: v_and_b32_e32 v10, 63, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
@@ -5848,8 +5820,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v8
-; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f
+; GFX6-NEXT: v_bfi_b32 v15, v8, 0, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15
; GFX6-NEXT: v_not_b32_e32 v16, 63
; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0
@@ -5897,8 +5869,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v8
-; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7f
+; GFX8-NEXT: v_bfi_b32 v15, v8, 0, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15
; GFX8-NEXT: v_not_b32_e32 v16, 63
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
@@ -5946,8 +5918,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v8
-; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7f
+; GFX9-NEXT: v_bfi_b32 v15, v8, 0, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
@@ -5990,107 +5962,103 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-LABEL: v_fshr_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v9, v8
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1
+; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 31, v1
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: v_and_b32_e32 v21, 0x7f, v8
-; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v9
-; GFX10-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v21
-; GFX10-NEXT: v_sub_nc_u32_e32 v12, 64, v20
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1]
-; GFX10-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21
-; GFX10-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8
+; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
+; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19
+; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20
-; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7]
-; GFX10-NEXT: v_or_b32_e32 v10, v12, v10
-; GFX10-NEXT: v_or_b32_e32 v11, v13, v11
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v21
-; GFX10-NEXT: v_or_b32_e32 v12, v15, v17
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19
+; GFX10-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
+; GFX10-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v14, v16
+; GFX10-NEXT: v_or_b32_e32 v10, v15, v17
; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v21
-; GFX10-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v18, v0, s5
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v19, v12, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s5
-; GFX10-NEXT: v_or_b32_e32 v0, v8, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5
+; GFX10-NEXT: v_or_b32_e32 v0, v12, v4
; GFX10-NEXT: v_or_b32_e32 v1, v7, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX10-NEXT: v_or_b32_e32 v3, v3, v9
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v9, v8
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1
+; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 31, v1
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v9
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_nc_u32_e32 v12, 64, v20
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v10, v12, v10
-; GFX11-NEXT: v_and_b32_e32 v21, 0x7f, v8
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1]
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20
-; GFX11-NEXT: v_or_b32_e32 v11, v13, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo
-; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21
-; GFX11-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v21
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3]
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19
; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v20
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
; GFX11-NEXT: v_or_b32_e32 v0, v14, v16
-; GFX11-NEXT: v_or_b32_e32 v12, v15, v17
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v10, v15, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v18, v0, s1
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v19, v12, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v13, v4, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
-; GFX11-NEXT: v_or_b32_e32 v0, v8, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1
+; GFX11-NEXT: v_or_b32_e32 v0, v12, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v1, v7, v5
; GFX11-NEXT: v_or_b32_e32 v2, v2, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v3, v3, v9
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
ret i128 %result
@@ -6099,12 +6067,12 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
; GFX6-LABEL: v_fshr_i128_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_not_b32_e32 v1, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f
; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
; GFX6-NEXT: v_not_b32_e32 v8, 63
@@ -6152,12 +6120,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
;
; GFX8-LABEL: v_fshr_i128_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_not_b32_e32 v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7f
; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
; GFX8-NEXT: v_not_b32_e32 v8, 63
@@ -6205,12 +6173,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
;
; GFX9-LABEL: v_fshr_i128_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_not_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f
; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
@@ -6257,101 +6225,99 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
;
; GFX10-LABEL: v_fshr_i128_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_not_b32_e32 v1, v0
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0
-; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v13
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11]
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11]
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX10-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX10-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11
+; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
+; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
+; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX10-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX10-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_or_b32_e32 v0, v7, v9
+; GFX10-NEXT: v_or_b32_e32 v7, v8, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
+; GFX10-NEXT: v_or_b32_e32 v0, v5, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i128_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_not_b32_e32 v1, v0
-; GFX11-NEXT: s_mov_b32 s9, 0
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
; GFX11-NEXT: s_lshr_b32 s8, s1, 31
-; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0
-; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v1
-; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v13
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11]
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5]
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v13
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX11-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11
+; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX11-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
+; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX11-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX11-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v7, v9
+; GFX11-NEXT: v_or_b32_e32 v7, v8, v10
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
+; GFX11-NEXT: v_or_b32_e32 v0, v5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX11-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v9
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -7486,226 +7452,224 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1
+; GFX6-NEXT: v_mov_b32_e32 v18, 0x7f
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GFX6-NEXT: v_bfi_b32 v19, v16, 0, v18
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v16
-; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0
-; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19
-; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16
-; GFX6-NEXT: v_or_b32_e32 v23, v0, v21
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25
-; GFX6-NEXT: v_or_b32_e32 v24, v1, v22
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0
-; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25
-; GFX6-NEXT: v_not_b32_e32 v26, 63
-; GFX6-NEXT: v_or_b32_e32 v21, v21, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26
-; GFX6-NEXT: v_or_b32_e32 v22, v22, v1
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0
+; GFX6-NEXT: v_not_b32_e32 v17, 63
+; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19
+; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v17
+; GFX6-NEXT: v_lshr_b64 v[23:24], v[21:22], v23
+; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[21:22], v19
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[21:22], v27
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
-; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0
-; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
-; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX6-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX6-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v16
+; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc
+; GFX6-NEXT: v_add_i32_e32 v16, vcc, v2, v17
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v2
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[8:9], v2
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[10:11], v21
; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5]
-; GFX6-NEXT: v_or_b32_e32 v0, v16, v8
-; GFX6-NEXT: v_or_b32_e32 v1, v17, v9
+; GFX6-NEXT: v_or_b32_e32 v21, v2, v21
+; GFX6-NEXT: v_or_b32_e32 v22, v3, v22
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v16
+; GFX6-NEXT: v_bfi_b32 v16, v20, 0, v18
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, v24, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v25, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v19, v8
+; GFX6-NEXT: v_or_b32_e32 v3, v23, v9
; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX6-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v20
-; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16
-; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26
-; GFX6-NEXT: v_or_b32_e32 v10, v4, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v5, v11
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16
+; GFX6-NEXT: v_add_i32_e32 v21, vcc, v16, v17
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[8:9], v10
+; GFX6-NEXT: v_lshl_b64 v[18:19], v[6:7], v16
; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v21
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v18
-; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v18
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc
+; GFX6-NEXT: v_or_b32_e32 v11, v11, v19
+; GFX6-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5]
+; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v20
+; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v10
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6
-; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26
-; GFX6-NEXT: v_or_b32_e32 v16, v4, v6
-; GFX6-NEXT: v_or_b32_e32 v19, v5, v7
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX6-NEXT: v_add_i32_e32 v17, vcc, v6, v17
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v6
+; GFX6-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5]
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v6
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], v6
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[14:15], v8
+; GFX6-NEXT: v_or_b32_e32 v8, v6, v8
+; GFX6-NEXT: v_or_b32_e32 v9, v7, v9
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v17
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
-; GFX6-NEXT: v_or_b32_e32 v4, v17, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT: v_or_b32_e32 v4, v16, v6
; GFX6-NEXT: v_or_b32_e32 v5, v18, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v7, v9, v11
+; GFX6-NEXT: v_or_b32_e32 v6, v10, v8
+; GFX6-NEXT: v_or_b32_e32 v7, v11, v9
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v18, 0x7f
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GFX8-NEXT: v_bfi_b32 v19, v16, 0, v18
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v16
-; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
-; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16
-; GFX8-NEXT: v_or_b32_e32 v23, v0, v21
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25
-; GFX8-NEXT: v_or_b32_e32 v24, v1, v22
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11]
-; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9]
-; GFX8-NEXT: v_not_b32_e32 v26, 63
-; GFX8-NEXT: v_or_b32_e32 v21, v21, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26
-; GFX8-NEXT: v_or_b32_e32 v22, v22, v1
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18]
+; GFX8-NEXT: v_not_b32_e32 v17, 63
+; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v17
+; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[21:22]
+; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v19, v[21:22]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v27, v[21:22]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
-; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18]
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
-; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
-; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX8-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX8-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v2, v17
+; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v2, v[10:11]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[10:11]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5]
-; GFX8-NEXT: v_or_b32_e32 v0, v16, v8
-; GFX8-NEXT: v_or_b32_e32 v1, v17, v9
+; GFX8-NEXT: v_or_b32_e32 v21, v2, v21
+; GFX8-NEXT: v_or_b32_e32 v22, v3, v22
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[10:11]
+; GFX8-NEXT: v_bfi_b32 v16, v20, 0, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v24, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v25, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v23, v9
; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX8-NEXT: v_not_b32_e32 v4, v20
-; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7]
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, v16, v26
-; GFX8-NEXT: v_or_b32_e32 v10, v4, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v5, v11
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v16, v17
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[18:19], v16, v[6:7]
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v21, v[8:9]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v18
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v19
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5]
+; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26
-; GFX8-NEXT: v_or_b32_e32 v16, v4, v6
-; GFX8-NEXT: v_or_b32_e32 v19, v5, v7
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v6, v17
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, v[14:15]
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[12:13]
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[14:15]
+; GFX8-NEXT: v_or_b32_e32 v8, v6, v8
+; GFX8-NEXT: v_or_b32_e32 v9, v7, v9
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v17, v[14:15]
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
-; GFX8-NEXT: v_or_b32_e32 v4, v17, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v4, v16, v6
; GFX8-NEXT: v_or_b32_e32 v5, v18, v7
-; GFX8-NEXT: v_or_b32_e32 v6, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v7, v9, v11
+; GFX8-NEXT: v_or_b32_e32 v6, v10, v8
+; GFX8-NEXT: v_or_b32_e32 v7, v11, v9
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v19, 0x7f
; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GFX9-NEXT: v_bfi_b32 v23, v16, 0, v19
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v16
-; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v23
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
-; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16
-; GFX9-NEXT: v_or_b32_e32 v23, v0, v21
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25
-; GFX9-NEXT: v_or_b32_e32 v24, v1, v22
+; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX9-NEXT: v_and_b32_e32 v26, 0x7f, v16
+; GFX9-NEXT: v_or_b32_e32 v24, v0, v21
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v26
+; GFX9-NEXT: v_or_b32_e32 v25, v1, v22
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11]
-; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], v26, v[8:9]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
; GFX9-NEXT: v_or_b32_e32 v21, v21, v0
-; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v23
; GFX9-NEXT: v_or_b32_e32 v22, v22, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v24, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v25, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
-; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25
-; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18]
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v26
+; GFX9-NEXT: v_lshlrev_b64 v[16:17], v23, v[17:18]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, v1, v22, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v21, v9, vcc
; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
@@ -7713,9 +7677,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_or_b32_e32 v1, v17, v9
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
+; GFX9-NEXT: v_bfi_b32 v16, v20, 0, v19
; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT: v_not_b32_e32 v4, v20
-; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4
; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16
; GFX9-NEXT: v_or_b32_e32 v2, v2, v10
; GFX9-NEXT: v_or_b32_e32 v3, v3, v11
@@ -7760,14 +7723,12 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-LABEL: v_fshr_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v17, v16
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17
+; GFX10-NEXT: v_bfi_b32 v25, v16, 0, 0x7f
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26
+; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25
; GFX10-NEXT: v_or_b32_e32 v2, v2, v17
; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25
@@ -7776,54 +7737,54 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1]
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25
+; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26
; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25
; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v22, v18, v22
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v26
; GFX10-NEXT: v_or_b32_e32 v21, v17, v21
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9]
+; GFX10-NEXT: v_bfi_b32 v25, v20, 0, 0x7f
; GFX10-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26
; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v3, s4
+; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20
; GFX10-NEXT: v_or_b32_e32 v16, v16, v18
; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4
; GFX10-NEXT: v_or_b32_e32 v17, v17, v19
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v26
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT: v_not_b32_e32 v16, v20
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v5
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4
-; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16
+; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25
; GFX10-NEXT: v_or_b32_e32 v6, v6, v10
-; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s4
; GFX10-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo
-; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25
; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7]
+; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
; GFX10-NEXT: v_or_b32_e32 v0, v23, v0
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
+; GFX10-NEXT: v_or_b32_e32 v8, v2, v8
; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13]
-; GFX10-NEXT: v_or_b32_e32 v8, v2, v8
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15]
; GFX10-NEXT: v_or_b32_e32 v2, v21, v26
; GFX10-NEXT: v_or_b32_e32 v9, v3, v9
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20
; GFX10-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15]
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20
; GFX10-NEXT: v_or_b32_e32 v8, v16, v18
; GFX10-NEXT: v_or_b32_e32 v16, v17, v19
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
@@ -7851,99 +7812,95 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX11-LABEL: v_fshr_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v17, v16
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v17
+; GFX11-NEXT: v_bfi_b32 v25, v16, 0, 0x7f
; GFX11-NEXT: v_lshrrev_b32_e32 v17, 31, v1
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, v2, v17
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
; GFX11-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1]
; GFX11-NEXT: v_and_b32_e32 v26, 0x7f, v16
; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v24, 0, v24 :: v_dual_add_nc_u32 v19, 0xffffffc0, v25
-; GFX11-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25
+; GFX11-NEXT: v_dual_cndmask_b32 v23, 0, v23 :: v_dual_cndmask_b32 v24, 0, v24
+; GFX11-NEXT: v_bfi_b32 v25, v20, 0, 0x7f
; GFX11-NEXT: v_or_b32_e32 v22, v18, v22
; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26
; GFX11-NEXT: v_or_b32_e32 v21, v17, v21
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1]
; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v20
; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_cndmask_b32 v21, v0, v21 :: v_dual_cndmask_b32 v22, v1, v22
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v16, v16, v18
; GFX11-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25
-; GFX11-NEXT: v_or_b32_e32 v17, v17, v19
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v19
; GFX11-NEXT: v_cndmask_b32_e64 v22, v22, v3, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11]
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11]
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v5
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX11-NEXT: v_not_b32_e32 v16, v20
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26
+; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v16 :: v_dual_cndmask_b32 v1, v1, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo
; GFX11-NEXT: v_or_b32_e32 v6, v6, v10
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_and_b32 v20, 0x7f, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v26, 0, v2 :: v_dual_and_b32 v25, 0x7f, v16
; GFX11-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s0
+; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX11-NEXT: v_or_b32_e32 v0, v23, v0
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7]
; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo
-; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15]
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5]
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7]
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
-; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13]
-; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v1, v24, v1
; GFX11-NEXT: v_or_b32_e32 v8, v2, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
; GFX11-NEXT: v_or_b32_e32 v2, v21, v26
; GFX11-NEXT: v_or_b32_e32 v9, v3, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15]
; GFX11-NEXT: v_or_b32_e32 v8, v16, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v16, v17, v19
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v8, s1
; GFX11-NEXT: v_lshrrev_b64 v[8:9], v20, v[14:15]
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v12, s2
; GFX11-NEXT: v_or_b32_e32 v3, v22, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v12, v4, v13, s2
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v8, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v9, s1
; GFX11-NEXT: v_or_b32_e32 v4, v10, v5
-; GFX11-NEXT: v_or_b32_e32 v5, v11, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v5, v11, v12
; GFX11-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v7, v7, v9
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
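
The fshr churn above follows directly from the bitfield-insert semantics the new pattern relies on: v_bfi_b32 computes (s0 & s1) | (~s0 & s2), so with a zero middle operand it reduces to ~s0 & s2, i.e. z & ~x. That is why a v_not/v_and pair (or an s_not feeding a v_and) collapses into a single v_bfi_b32 with 0 as the second source, including the 0x7f shift-amount masks in these tests. A minimal sketch in C, assuming the usual bitfield-insert definition, that spot-checks the identity (the helper name and test values are illustrative only):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Reference model of bitfield insert: (s0 & s1) | (~s0 & s2). */
  static uint32_t bfi_b32(uint32_t s0, uint32_t s1, uint32_t s2) {
    return (s0 & s1) | (~s0 & s2);
  }

  int main(void) {
    /* Check that bfi(x, 0, z) == (z & ~x) on a few values. */
    uint32_t tests[][2] = {
        {0x00000000u, 0xffffffffu},
        {0xdeadbeefu, 0x12345678u},
        {0x0000007fu, 0x000000c3u}, /* 0x7f mask, as in the fshr tests */
    };
    for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i) {
      uint32_t x = tests[i][0], z = tests[i][1];
      assert(bfi_b32(x, 0u, z) == (z & ~x));
    }
    printf("identity holds\n");
    return 0;
  }
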
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index cae833b0d64e3..0e1bbbd1ea92b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -123,9 +123,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
@@ -143,11 +142,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -302,9 +300,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -319,9 +316,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_and_b32 s1, s4, 0xffff
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -393,9 +389,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -410,9 +405,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -482,12 +476,11 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -505,11 +498,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -576,10 +568,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -597,11 +588,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -668,10 +658,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -689,11 +678,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -820,19 +808,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-LABEL: insertelement_v_v4i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_and_b32 s1, s3, 1
; GFX8-NEXT: s_lshr_b32 s0, s3, 1
+; GFX8-NEXT: s_and_b32 s1, s3, 1
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX8-NEXT: s_lshl_b32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s1, v4
+; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4
; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
@@ -846,19 +833,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s3, 1
; GFX7-NEXT: s_lshr_b32 s0, s3, 1
+; GFX7-NEXT: s_and_b32 s1, s3, 1
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: s_lshl_b32 s2, s2, s1
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v2, s2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1090,8 +1076,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1117,8 +1102,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -1228,8 +1212,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1246,17 +1229,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -1356,16 +1338,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1382,16 +1363,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: s_and_b32 s0, s2, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1479,15 +1459,14 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: s_lshr_b32 s0, s2, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, s1, v5
+; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1501,19 +1480,18 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s2, 1
; GFX7-NEXT: s_lshr_b32 s0, s2, 1
+; GFX7-NEXT: s_and_b32 s1, s2, 1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1601,16 +1579,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT: v_not_b32_e32 v3, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1627,16 +1604,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1910,14 +1886,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-LABEL: insertelement_v_v8i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s3, 1
; GFX8-NEXT: s_lshr_b32 s4, s3, 1
+; GFX8-NEXT: s_and_b32 s0, s3, 1
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshl_b32 s5, s1, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s6, s0
+; GFX8-NEXT: s_lshl_b32 s5, s1, s0
+; GFX8-NEXT: s_lshl_b32 s6, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
@@ -1926,7 +1901,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6
; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
@@ -1942,14 +1917,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s3, 1
; GFX7-NEXT: s_lshr_b32 s4, s3, 1
+; GFX7-NEXT: s_and_b32 s0, s3, 1
; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshl_b32 s5, s1, s0
-; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s6, s0
+; GFX7-NEXT: s_lshl_b32 s5, s1, s0
+; GFX7-NEXT: s_lshl_b32 s6, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -1958,7 +1932,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4
; GFX7-NEXT: v_or_b32_e32 v4, s5, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
@@ -2263,17 +2237,16 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
@@ -2294,23 +2267,22 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
; GFX7-NEXT: v_mov_b32_e32 v5, s11
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
@@ -2441,23 +2413,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2478,23 +2449,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -2628,7 +2598,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v7, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v8, 0
@@ -2636,7 +2605,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
@@ -2658,9 +2627,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -2668,7 +2636,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7
; GFX7-NEXT: v_or_b32_e32 v7, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
@@ -2773,13 +2741,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-LABEL: insertelement_v_v8i16_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s2, 1
; GFX8-NEXT: s_lshr_b32 s4, s2, 1
+; GFX8-NEXT: s_and_b32 s0, s2, 1
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s5, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -2789,7 +2756,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
@@ -2805,14 +2772,13 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s2, 1
; GFX7-NEXT: s_lshr_b32 s4, s2, 1
+; GFX7-NEXT: s_and_b32 s0, s2, 1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
-; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s5, s0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -2821,7 +2787,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
@@ -2935,7 +2901,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
@@ -2943,7 +2908,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -2959,15 +2924,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -2975,7 +2939,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -3283,19 +3247,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: s_and_b32 s0, s3, 1
+; GFX8-NEXT: s_lshr_b32 m0, s3, 1
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshr_b32 m0, s3, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: v_mov_b32_e32 v10, 16
; GFX8-NEXT: v_mov_b32_e32 v11, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_movrels_b32_e32 v12, v0
-; GFX8-NEXT: v_and_b32_e32 v12, s0, v12
+; GFX8-NEXT: v_bfi_b32 v12, s0, 0, v12
; GFX8-NEXT: v_or_b32_e32 v12, s1, v12
; GFX8-NEXT: v_movreld_b32_e32 v0, v12
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -3310,17 +3273,16 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_and_b32 s0, s3, 1
+; GFX7-NEXT: s_lshr_b32 m0, s3, 1
; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshr_b32 m0, s3, 1
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_movrels_b32_e32 v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: v_movreld_b32_e32 v2, v0
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0
@@ -3644,21 +3606,20 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX8-NEXT: v_mov_b32_e32 v6, s21
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v7, s22
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v9, s23
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
@@ -3705,20 +3666,19 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: v_mov_b32_e32 v6, s21
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v7, s22
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v9, s23
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v9, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s16
; GFX7-NEXT: v_mov_b32_e32 v1, s17
@@ -3936,20 +3896,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX8-NEXT: v_mov_b32_e32 v7, s17
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v9, s18
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v10, s19
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
@@ -3996,20 +3955,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: v_mov_b32_e32 v7, s17
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mov_b32_e32 v9, s18
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mov_b32_e32 v10, s19
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v9, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
@@ -4216,7 +4174,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v11, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v12, 0
@@ -4231,7 +4188,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v15, v0, v15
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
@@ -4263,9 +4220,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX7-NEXT: s_mov_b64 s[16:17], 0
; GFX7-NEXT: s_mov_b32 s18, -1
@@ -4278,7 +4234,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v11, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v11
; GFX7-NEXT: v_or_b32_e32 v11, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
@@ -4452,14 +4408,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_mov_b32_e32 v13, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v11, 16
; GFX8-NEXT: v_mov_b32_e32 v12, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_movrels_b32_e32 v13, v3
-; GFX8-NEXT: v_and_b32_e32 v13, s0, v13
+; GFX8-NEXT: v_bfi_b32 v13, s0, 0, v13
; GFX8-NEXT: v_or_b32_e32 v2, v13, v2
; GFX8-NEXT: v_movreld_b32_e32 v3, v2
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
@@ -4474,17 +4429,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_and_b32 s0, s2, 1
+; GFX7-NEXT: s_lshr_b32 m0, s2, 1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshr_b32 m0, s2, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_movrels_b32_e32 v1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, s0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_movreld_b32_e32 v3, v0
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
@@ -4611,7 +4565,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v12, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v13, 0
@@ -4626,7 +4579,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v16, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc
@@ -4654,13 +4607,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX7-NEXT: s_mov_b64 s[16:17], 0
; GFX7-NEXT: s_mov_b32 s18, -1
@@ -4673,7 +4625,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v12, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index fe7d421d27f84..4598bcc04a505 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -910,9 +910,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
@@ -930,11 +929,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1089,9 +1087,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1106,9 +1103,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_and_b32 s1, s4, 0xff
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -1180,9 +1176,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1197,9 +1192,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -1269,12 +1263,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1292,11 +1285,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1363,10 +1355,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1384,11 +1375,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1455,10 +1445,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1476,11 +1465,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -1683,19 +1671,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-LABEL: insertelement_v_v8i8_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_and_b32 s1, s3, 3
; GFX8-NEXT: s_lshr_b32 s0, s3, 2
+; GFX8-NEXT: s_and_b32 s1, s3, 3
; GFX8-NEXT: s_and_b32 s2, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 3
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX8-NEXT: s_lshl_b32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s1, v4
+; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4
; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
@@ -1709,19 +1696,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s3, 3
; GFX7-NEXT: s_lshr_b32 s0, s3, 2
+; GFX7-NEXT: s_and_b32 s1, s3, 3
; GFX7-NEXT: s_and_b32 s2, s2, 0xff
; GFX7-NEXT: s_lshl_b32 s1, s1, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: s_lshl_b32 s2, s2, s1
; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v2, s2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1953,8 +1939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1980,8 +1965,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2091,8 +2075,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -2109,17 +2092,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1
; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2219,16 +2201,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_and_b32 s0, s2, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2245,16 +2226,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
; GFX7-NEXT: s_and_b32 s0, s2, 0xff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2342,15 +2322,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: s_lshr_b32 s0, s2, 2
; GFX8-NEXT: s_lshl_b32 s1, s1, 3
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, s1, v5
+; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -2364,19 +2343,18 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s2, 3
; GFX7-NEXT: s_lshr_b32 s0, s2, 2
+; GFX7-NEXT: s_and_b32 s1, s2, 3
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: s_lshl_b32 s1, s1, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2
; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -2464,16 +2442,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
; GFX8-NEXT: v_mov_b32_e32 v6, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT: v_not_b32_e32 v3, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2490,16 +2467,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2773,14 +2749,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-LABEL: insertelement_v_v16i8_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s3, 3
; GFX8-NEXT: s_lshr_b32 s4, s3, 2
+; GFX8-NEXT: s_and_b32 s0, s3, 3
; GFX8-NEXT: s_and_b32 s1, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: s_lshl_b32 s5, s1, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s6, s0
+; GFX8-NEXT: s_lshl_b32 s5, s1, s0
+; GFX8-NEXT: s_lshl_b32 s6, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
@@ -2789,7 +2764,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6
; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
@@ -2805,14 +2780,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s3, 3
; GFX7-NEXT: s_lshr_b32 s4, s3, 2
+; GFX7-NEXT: s_and_b32 s0, s3, 3
; GFX7-NEXT: s_and_b32 s1, s2, 0xff
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
-; GFX7-NEXT: s_lshl_b32 s5, s1, s0
-; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s6, s0
+; GFX7-NEXT: s_lshl_b32 s5, s1, s0
+; GFX7-NEXT: s_lshl_b32 s6, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -2821,7 +2795,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4
; GFX7-NEXT: v_or_b32_e32 v4, s5, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
@@ -3126,17 +3100,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_and_b32 s4, s4, 0xff
; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
@@ -3157,23 +3130,22 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xff
; GFX7-NEXT: v_mov_b32_e32 v5, s11
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
@@ -3304,23 +3276,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3341,23 +3312,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -3491,7 +3461,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v7, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v8, 0
@@ -3499,7 +3468,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
@@ -3521,9 +3490,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -3531,7 +3499,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7
; GFX7-NEXT: v_or_b32_e32 v7, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
@@ -3636,13 +3604,12 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-LABEL: insertelement_v_v16i8_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s2, 3
; GFX8-NEXT: s_lshr_b32 s4, s2, 2
+; GFX8-NEXT: s_and_b32 s0, s2, 3
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s5, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b32 s5, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -3652,7 +3619,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
@@ -3668,14 +3635,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s2, 3
; GFX7-NEXT: s_lshr_b32 s4, s2, 2
+; GFX7-NEXT: s_and_b32 s0, s2, 3
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
-; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s5, s0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_lshl_b32 s5, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -3684,7 +3650,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
@@ -3798,7 +3764,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
@@ -3806,7 +3771,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -3822,15 +3787,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -3838,7 +3802,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
index e22cee87e17da..d0e32fc205144 100644
--- a/llvm/test/CodeGen/AMDGPU/andorn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -48,8 +48,7 @@ entry:
}
; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
-; GCN: v_not_b32
-; GCN: v_and_b32
+; GCN: v_bfi_b32
define amdgpu_kernel void @vector_andn2_i32_s_v_one_use(
ptr addrspace(1) %r0, i32 %s) {
entry:
@@ -61,8 +60,7 @@ entry:
}
; GCN-LABEL: {{^}}vector_andn2_i32_v_s_one_use
-; GCN: s_not_b32
-; GCN: v_and_b32
+; GCN: v_bfi_b32
define amdgpu_kernel void @vector_andn2_i32_v_s_one_use(
ptr addrspace(1) %r0, i32 %s) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 089d6f5bf57ca..1492119a6022d 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -77,8 +77,7 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
; GCN-NEXT: s_mov_b32 s9, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_bfi_b32 v0, v0, 0, 1
; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GCN-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
index 187f19f653858..52d4780005aad 100644
--- a/llvm/test/CodeGen/AMDGPU/bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -99,9 +99,8 @@ define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) {
;
; GFX950-GISEL-LABEL: and_not_and_and:
; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: v_not_b32_e32 v1, v1
; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX950-GISEL-NEXT: ; return to shader part epilog
;
; GFX1250-SDAG-LABEL: and_not_and_and:
@@ -111,10 +110,9 @@ define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) {
;
; GFX1250-GISEL-LABEL: and_not_and_and:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_not_b32_e32 v1, v1
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX1250-GISEL-NEXT: ; return to shader part epilog
%notb = xor i32 %b, -1
%and1 = and i32 %a, %c
@@ -268,9 +266,8 @@ define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) {
; GFX950-GISEL-LABEL: test_12_src_overflow:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_not_b32_e32 v3, v0
-; GFX950-GISEL-NEXT: v_not_b32_e32 v4, v2
; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc
-; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v3, v4
+; GFX950-GISEL-NEXT: v_bfi_b32 v2, v2, 0, v3
; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xc8
; GFX950-GISEL-NEXT: ; return to shader part epilog
;
@@ -282,10 +279,9 @@ define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) {
; GFX1250-GISEL-LABEL: test_12_src_overflow:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_not_b32_e32 v3, v0
-; GFX1250-GISEL-NEXT: v_not_b32_e32 v4, v2
; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v3, v4
+; GFX1250-GISEL-NEXT: v_bfi_b32 v2, v2, 0, v3
; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xc8
; GFX1250-GISEL-NEXT: ; return to shader part epilog
%nota = xor i32 %a, -1
@@ -312,13 +308,12 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) {
;
; GFX950-GISEL-LABEL: test_100_src_overflow:
; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: v_bitop3_b32 v3, v2, v0, v2 bitop3:3
-; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v3
+; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v2, v0
; GFX950-GISEL-NEXT: v_bitop3_b32 v4, v0, v1, v0 bitop3:0x30
; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v1, v0
-; GFX950-GISEL-NEXT: v_not_b32_e32 v1, v2
+; GFX950-GISEL-NEXT: v_bfi_b32 v3, v3, 0, v1
; GFX950-GISEL-NEXT: v_and_b32_e32 v4, v4, v2
-; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT: v_bfi_b32 v0, v2, 0, v0
; GFX950-GISEL-NEXT: v_or3_b32 v0, v3, v4, v0
; GFX950-GISEL-NEXT: ; return to shader part epilog
;
@@ -333,16 +328,15 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) {
;
; GFX1250-GISEL-LABEL: test_100_src_overflow:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_bitop3_b32 v3, v2, v0, v2 bitop3:3
+; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v2, v0
; GFX1250-GISEL-NEXT: v_bitop3_b32 v4, v0, v1, v0 bitop3:0x30
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v1, v0
-; GFX1250-GISEL-NEXT: v_not_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v4, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-GISEL-NEXT: v_bfi_b32 v1, v3, 0, v1
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v4, v2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v5
-; GFX1250-GISEL-NEXT: v_or3_b32 v0, v1, v2, v0
+; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v2, 0, v0
+; GFX1250-GISEL-NEXT: v_or3_b32 v0, v1, v3, v0
; GFX1250-GISEL-NEXT: ; return to shader part epilog
%or1 = or i32 %c, %a
%not1 = xor i32 %or1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 355f77acfd302..ba5ce8bb5fae7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -85,10 +85,8 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
-; SI-NEXT: v_not_b32_e32 v5, v5
-; SI-NEXT: v_not_b32_e32 v4, v4
-; SI-NEXT: v_and_b32_e32 v5, v3, v5
-; SI-NEXT: v_and_b32_e32 v4, v2, v4
+; SI-NEXT: v_bfi_b32 v5, v5, 0, v3
+; SI-NEXT: v_bfi_b32 v4, v4, 0, v2
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc